# CLASSIFICATION OF REASON FOR CLOSURE

In [103]:
import numpy as np
import pandas as pd
import math
import nltk
import re
import string
import textstat
from sklearn.metrics import classification_report
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

## READING TEST DATA

In [104]:
demoData=pd.read_csv("demoData2.csv")

In [105]:
demoData.shape

(400, 18)

In [106]:
demoData.columns

Index(['Unnamed: 0', 'PostId', 'UserId', 'Comment', 'Body', 'Title', 'Tags',
       'Reputation', 'experienceInTime', 'WebsiteUrl', 'Location', 'AboutMe',
       'Views', 'Upvotes', 'Downvotes', 'ProfileImageUrl', 'tagCount',
       'closed'],
      dtype='object')

In [107]:
demoData.head()

Unnamed: 0.1,Unnamed: 0,PostId,UserId,Comment,Body,Title,Tags,Reputation,experienceInTime,WebsiteUrl,Location,AboutMe,Views,Upvotes,Downvotes,ProfileImageUrl,tagCount,closed
0,34467,49779049,8875256,103,<p>I have the below table. I only want to retu...,Return rolling 12 months of current (string) m...,<sql><oracle>,85,160,,,,64,21,0,https://www.gravatar.com/avatar/648315974a4a6f...,2,1
1,55833,38345565,6108368,104,<p>I want to read ticker info without authenti...,How to get bitfinex ticker with c#,<c#><api><ticker>,34,111,,,,7,5,0,https://lh3.googleusercontent.com/-hOCKrh-IeAg...,3,1
2,73244,4117463,493325,105,<p>What is the best way to display results and...,What is the best way to display results and da...,<php><database><list><report>,339,6,,,<p>I am a simple youngman</p>\r\r\r\r\r\r\r\n\...,298,22,1,,4,1
3,62520,25799820,3622907,105,<p>Suppose it's this code:</p>\r\r\r\r\r\r\r\n...,"Is it okay to add ""transition: all"" when in fa...",<css>,196,125,,"Kolkata, India",,152,8,0,https://i.stack.imgur.com/hrHz7.jpg,1,1
4,18326,6825778,529215,102,"<p>Using Java Android, I am trying to find a w...",Face recognition API,<java><android><face-recognition>,1944,235,https://www.linkedin.com/profile/view?id=56518...,"Colombo, Sri Lanka",<p>Open source developer at WSO2.</p>\r\r\r\r\...,455,695,43,,3,1


## FEATURE ENGINEERING

In [108]:
#Credits: https://stackoverflow.com/questions/4576077/python-split-text-on-sentences

alphabets= "([A-Za-z])"
prefixes = "(Mr|St|Mrs|Ms|Dr)[.]"
suffixes = "(Inc|Ltd|Jr|Sr|Co)"
starters = "(Mr|Mrs|Ms|Dr|He\s|She\s|It\s|They\s|Their\s|Our\s|We\s|But\s|However\s|That\s|This\s|Wherever)"
acronyms = "([A-Z][.][A-Z][.](?:[A-Z][.])?)"
websites = "[.](com|net|org|io|gov)"
digits = "([0-9])" 

def split_into_sentences(text):
    #text=body
    text=striphtml(text)
    text = " " + text + "  "
    text = text.replace("\n"," ")
    text = re.sub(prefixes,"\\1<prd>",text)
    text = re.sub(websites,"<prd>\\1",text)
    if "Ph.D" in text: text = text.replace("Ph.D.","Ph<prd>D<prd>")
    text = re.sub("\s" + alphabets + "[.] "," \\1<prd> ",text)
    text = re.sub(acronyms+" "+starters,"\\1<stop> \\2",text)
    text = re.sub(alphabets + "[.]" + alphabets + "[.]" + alphabets + "[.]","\\1<prd>\\2<prd>\\3<prd>",text)
    text = re.sub(alphabets + "[.]" + alphabets + "[.]","\\1<prd>\\2<prd>",text)
    text = re.sub(" "+suffixes+"[.] "+starters," \\1<stop> \\2",text)
    text = re.sub(" "+suffixes+"[.]"," \\1<prd>",text)
    text = re.sub(" " + alphabets + "[.]"," \\1<prd>",text)
    text = re.sub(digits + "[.]" + digits,"\\1<prd>\\2",text)
    if "”" in text: text = text.replace(".”","”.")
    if "\"" in text: text = text.replace(".\"","\".")
    if "!" in text: text = text.replace("!\"","\"!")
    if "?" in text: text = text.replace("?\"","\"?")
    text = text.replace(".",".<stop>")
    text = text.replace("?","?<stop>")
    text = text.replace("!","!<stop>")
    text = text.replace("<prd>",".")
    sentences = text.split("<stop>")
    sentences = sentences[:-1]
    sentences = [s.strip() for s in sentences]
    #print(text)
    return sentences

In [109]:
#METHODS FOR FEATURE CREATION

#Removes all occurence of <code>    
def removeCode(text):
    #make sure you don't strip HTML beforehand
    textWithoutCode=re.sub(r"<code>(.*?)</code>","",text,flags= re.DOTALL)
    #print(codes)
    return textWithoutCode

#Removes HTML from query data but doesn't remove the content bw opening and closing tag
#Attributes are removed eg:<a href="this is all removed">not removed</a>
#https://stackoverflow.com/questions/3398852/using-python-remove-html-tags-formatting-from-a-string
def striphtml(data):
    p = re.compile(r'<.*?>')
    return p.sub('', data)
  
    
#Returns number of lowercase chars in the text
def lowercaseCount(text):
    text=removeCode(text)
    text=striphtml(text)
    lowercount=0
    try:
        for w in text:
            for char in w:
                if(char.islower()):
                    lowercount+=1
        return lowercount
    except:
        #print(type(text))
        print(text)

#Returns number of uppercase chars in the text
def uppercaseCount(text):
    text=removeCode(text)
    text=striphtml(text)
    uppercount=0
    try:    
        for w in text:
            for char in w:
                if(char.isupper()):
                    uppercount+=1
        return uppercount
    except:
        print(text)
    
    
#Count of sentences in the post body
def getSentenceCount(text):
    text=removeCode(text)
    return len(split_into_sentences(text))

def getFirstLineLength(text):
    text=removeCode(text)
    sentences=split_into_sentences(text)
    #print(len(sentences))
    if(len(sentences)==0):
        #print(text)
        return 0
    firstLine=sentences[0]
    return len(firstLine)
 
#Returns the length of total code present in the post
def codeLength(text):
    #make sure you don't strip HTML beforehand
    codes=re.findall(r"<code>(.*?)</code>",text,flags= re.DOTALL)
    #print(codes)
    return len(''.join(codes))

#https://stackoverflow.com/questions/6883049/regex-to-extract-urls-from-href-attribute-in-html-with-python
#https://stackoverflow.com/questions/1374457/find-out-how-many-times-a-regex-matches-in-a-string-in-python
#Returns count of url in a given post
def urlCount(text):
    urls=re.findall(r'https://?|ftp://',text)
    #print(urls)
    return len(urls)


#Returns count of URls pointing to Stack Overflow 
def SOUrlCount(text):
    SOUrls=re.findall(r'https://stackoverflow.com',text)
    #print(SOUrls)
    return len(SOUrls)
       
    
def countOfInterrogativeSent(text):
    text=text.lower()
    #tokens=text.split() #1D list of words 
    interrogatives=re.findall(r'\b[a-z]*[^.!][?]',text)
    return len(interrogatives)
    
    

def sentencesStartWithYouCount(text):
    count=0
    text=text.lower()
    #text=re.sub('<.*>','',text)
    text=removeCode(text)
    text=striphtml(text)
    sentences=split_into_sentences(text)
    for sentence in sentences:
        tokens=re.split('[\s;\',.\-\%]',sentence)
        #print(tokens)
        if(tokens[0]=="you"):
            #print("token0="+tokens[0])
            count+=1
    return count;

def sentencesStartWithICount(text):
    count=0
    text=text.lower()
    text=removeCode(text)
    text=striphtml(text)
    sentences=split_into_sentences(text)
    for sentence in sentences:
        tokens=re.split('[\s;\',.\-\%]',sentence)
        #print(tokens)
        if(tokens[0]=="i"):
            #print("token0="+tokens[0])
            count+=1
        if(len(tokens)>=2): #But I don't have, So I switched 
            if(tokens[1]=="i"):
                count+=1
        if(len(tokens)>=3):
            if(tokens[2]=="i"): #As such I had amended...
                count+=1
    return count;


#https://stackoverflow.com/questions/12628958/remove-small-words-using-python
def countShortWords(text):
    text=removeCode(text)
    text=striphtml(text)
    shortwords = re.findall(r'\W*\b\w{1,3}\b',text)
    #print(shortwords)
    return len(shortwords)

   
#Returns count of total number of words in the query
#https://stackoverflow.com/questions/19410018/how-to-count-the-number-of-words-in-a-sentence-ignoring-numbers-punctuation-an
def countWords(text):
    text=removeCode(text)
    text=striphtml(text)
    return len(text.split())


#Returns count of punctuations in the post body
#https://stackoverflow.com/questions/265960/best-way-to-strip-punctuation-from-a-string-in-python
def countPunctuations(text):
    #remove all code in the post
    text=re.sub('<code>.*?</code>','',text)
    
    text=striphtml(text)  #will also remove all URLs as URL is mentioned in href=" " attribute
    puncs=re.findall(r'[^\w\s]',text)
    #print(puncs)
    return len(puncs)

def makeBinary(text):
    if(pd.isnull(text)):
        return 0
    return 1


In [110]:
#Adding necessary feature cJolumns
#https://stackoverflow.com/questions/40045632/adding-a-column-in-pandas-df-using-a-function

demoData['lowerUpperRatio']=(demoData['Body'].apply(lowercaseCount))/(demoData['Body'].apply(uppercaseCount))

demoData['sentenceCount']=demoData['Body'].apply(getSentenceCount)

demoData['firstLineLength']=demoData['Body'].apply(getFirstLineLength)

demoData['codeLength']=demoData['Body'].apply(codeLength)

demoData['urlCount']=demoData['Body'].apply(urlCount)

demoData['SOUrlCount']=demoData['Body'].apply(SOUrlCount)

demoData['titleLengthInChars']=demoData['Title'].apply(len)

demoData['countOfInterrogativeSent']=demoData['Body'].apply(countOfInterrogativeSent)

demoData['sentencesStartWithYouCount']=demoData['Body'].apply(sentencesStartWithYouCount)
demoData['sentencesStartWithICount']=demoData['Body'].apply(sentencesStartWithICount)

demoData['shortWordCount']=demoData['Body'].apply(countShortWords)
demoData['bodyWordCount']=demoData['Body'].apply(countWords)
demoData['punctuationCount']=demoData['Body'].apply(countPunctuations)

demoData['websiteUrlFilled']=demoData['WebsiteUrl'].apply(makeBinary)
demoData['locationFilled']=demoData['Location'].apply(makeBinary) 
demoData['aboutMeFilled']=demoData['AboutMe'].apply(makeBinary)
demoData['profileImageUrlFilled']=demoData['ProfileImageUrl'].apply(makeBinary)

In [111]:
def gunningFog(text):
    text=removeCode(text)
    text=striphtml(text)
    return textstat.gunning_fog(text)

def fleschReadingEase(text):
    text=removeCode(text)
    text=striphtml(text)
    return textstat.flesch_reading_ease(text)

def daleChallReadabilityScore(text):
    text=removeCode(text)
    text=striphtml(text)
    return textstat.dale_chall_readability_score(text)


def smogIndex(text):
    text=removeCode(text)
    text=striphtml(text)
    return textstat.smog_index(text)

def colemanLiauIndex(text):
    text=removeCode(text)
    text=striphtml(text)
    return textstat.coleman_liau_index(text)


In [112]:
#https://en.wikipedia.org/wiki/Gunning_fog_index
demoData['gunningFog']=demoData['Body'].apply(gunningFog)
#https://en.wikipedia.org/wiki/Flesch%E2%80%93Kincaid_readability_tests#Flesch_reading_ease
demoData['fleschReadingEase']=demoData['Body'].apply(fleschReadingEase)
#https://en.wikipedia.org/wiki/Dale%E2%80%93Chall_readability_formula
demoData['daleChallReadabilityScore']=demoData['Body'].apply(daleChallReadabilityScore)
#https://en.wikipedia.org/wiki/SMOG
demoData['smogIndex']=demoData['Body'].apply(smogIndex)
#https://en.wikipedia.org/wiki/Coleman%E2%80%93Liau_index
demoData['colemanLiauIndex']=demoData['Body'].apply(colemanLiauIndex)

In [113]:
demoData['userProfileFilled']=demoData['websiteUrlFilled']+demoData['locationFilled']+demoData['aboutMeFilled']+demoData['profileImageUrlFilled']
print(demoData.head())

   Unnamed: 0    PostId   UserId  Comment  \
0       34467  49779049  8875256      103   
1       55833  38345565  6108368      104   
2       73244   4117463   493325      105   
3       62520  25799820  3622907      105   
4       18326   6825778   529215      102   

                                                Body  \
0  <p>I have the below table. I only want to retu...   
1  <p>I want to read ticker info without authenti...   
2  <p>What is the best way to display results and...   
3  <p>Suppose it's this code:</p>\r\r\r\r\r\r\r\n...   
4  <p>Using Java Android, I am trying to find a w...   

                                               Title  \
0  Return rolling 12 months of current (string) m...   
1                 How to get bitfinex ticker with c#   
2  What is the best way to display results and da...   
3  Is it okay to add "transition: all" when in fa...   
4                               Face recognition API   

                                Tags  Reputation  exper

### FROM 18 Columns to 41 Columns

In [114]:
demoData.columns

Index(['Unnamed: 0', 'PostId', 'UserId', 'Comment', 'Body', 'Title', 'Tags',
       'Reputation', 'experienceInTime', 'WebsiteUrl', 'Location', 'AboutMe',
       'Views', 'Upvotes', 'Downvotes', 'ProfileImageUrl', 'tagCount',
       'closed', 'lowerUpperRatio', 'sentenceCount', 'firstLineLength',
       'codeLength', 'urlCount', 'SOUrlCount', 'titleLengthInChars',
       'countOfInterrogativeSent', 'sentencesStartWithYouCount',
       'sentencesStartWithICount', 'shortWordCount', 'bodyWordCount',
       'punctuationCount', 'websiteUrlFilled', 'locationFilled',
       'aboutMeFilled', 'profileImageUrlFilled', 'gunningFog',
       'fleschReadingEase', 'daleChallReadabilityScore', 'smogIndex',
       'colemanLiauIndex', 'userProfileFilled'],
      dtype='object')

In [115]:
demoData.shape

(400, 41)

In [116]:
X = demoData.loc[:,['Comment', 'Reputation',
       'experienceInTime', 'Views',
       'Upvotes', 'Downvotes', 'tagCount',
       'lowerUpperRatio', 'sentenceCount', 'firstLineLength', 'codeLength',
       'urlCount', 'SOUrlCount', 'titleLengthInChars',
       'countOfInterrogativeSent', 'sentencesStartWithYouCount',
       'sentencesStartWithICount', 'shortWordCount', 'bodyWordCount',
       'punctuationCount', 'gunningFog',
       'fleschReadingEase', 'daleChallReadabilityScore', 'smogIndex',
       'colemanLiauIndex', 'userProfileFilled'] ]

X.replace(np.nan,0,inplace=True)  #replacing NaN to 0
X.lowerUpperRatio.replace(np.inf,0,inplace=True)  #replacing infinity to 0

In [117]:
y = X.Comment-102
X.drop(['Comment'],inplace=True,axis=1)
print("Shape of X: "+str(X.shape))
print("Shape of y: "+str(y.shape))

Shape of X: (400, 25)
Shape of y: (400,)


## FEATURE SCALING

In [118]:
#(x-u)/std

mean=[2.35523230e+03, 4.34132119e+02, 3.40874919e+02, 2.26129319e+02,
 2.12335492e+01, 2.79326466e+00, 3.14914157e+01, 5.35521776e+00,
 1.00150008e+02, 3.59061591e+02, 2.09772989e-01, 2.95014751e-02,
 4.99136832e+01, 1.21304815e+00, 9.32546627e-03, 1.79547727e+00,
 3.77248362e+01, 7.93895445e+01, 7.07764763e+01, 2.13824006e+01,
 5.07359859e+01, 7.99246512e+00, 4.25646657e+00, 1.13319457e+01,
 1.53087654e+00]

var=[1.66741102e+08, 3.23248073e+05, 3.99683595e+06, 5.91791687e+05,
 1.47856693e+05, 1.51101319e+00, 7.36299857e+02, 5.71612347e+01,
 1.24934731e+04, 1.48916971e+06, 6.45592273e-01, 5.61825156e-02,
 4.11095792e+02, 2.71061078e+00, 1.26386720e-02, 2.59538982e+00,
 1.17190623e+03, 5.05811079e+03, 3.77168621e+04, 1.85683474e+02,
 1.99780688e+03, 4.70088956e+00, 3.29718449e+01, 3.54714047e+01,
 1.22439541e+00]

print(X.head())
X=(X-mean)/np.sqrt(var)
print(X.head())

   Reputation  experienceInTime  Views  Upvotes  Downvotes  tagCount  \
0          85               160     64       21          0         2   
1          34               111      7        5          0         3   
2         339                 6    298       22          1         4   
3         196               125    152        8          0         1   
4        1944               235    455      695         43         3   

   lowerUpperRatio  sentenceCount  firstLineLength  codeLength  ...  \
0        19.833333              3               23         193  ...   
1        30.600000              3               72           0  ...   
2         8.838710              4               94           0  ...   
3        30.400000              2              142         164  ...   
4        12.333333              3              102           0  ...   

   sentencesStartWithICount  shortWordCount  bodyWordCount  punctuationCount  \
0                         2              12             31  

In [119]:
print(mean)
print(np.sqrt(var))

[2355.2323, 434.132119, 340.874919, 226.129319, 21.2335492, 2.79326466, 31.4914157, 5.35521776, 100.150008, 359.061591, 0.209772989, 0.0295014751, 49.9136832, 1.21304815, 0.00932546627, 1.79547727, 37.7248362, 79.3895445, 70.7764763, 21.3824006, 50.7359859, 7.99246512, 4.25646657, 11.3319457, 1.53087654]
[1.29128270e+04 5.68549095e+02 1.99920883e+03 7.69279980e+02
 3.84521382e+02 1.22923276e+00 2.71348458e+01 7.56050492e+00
 1.11774206e+02 1.22031541e+03 8.03487569e-01 2.37028512e-01
 2.02754973e+01 1.64639326e+00 1.12421848e-01 1.61102136e+00
 3.42331160e+01 7.11203964e+01 1.94208296e+02 1.36265723e+01
 4.46968330e+01 2.16815349e+00 5.74211154e+00 5.95578750e+00
 1.10652402e+00]


## IMPORTING PRETRAINED MODEL

In [120]:
from joblib import dump, load
svm=load('rbf_svm_2.joblib') 

In [121]:
print(svm)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)


## PREDICTION

In [122]:
y_pred=svm.predict(X)
print("Actual Predicted")
for i in range(0,25):
    print(str(y[i])+"\t"+str(y_pred[i]))

Actual Predicted
1	1
2	2
3	3
3	1
0	3
2	2
3	0
1	1
2	1
2	2
0	1
0	2
1	1
3	3
1	1
0	1
2	0
1	1
3	3
0	0
0	1
2	2
2	2
2	2
0	0


## MODEL PERFORMANCE

In [123]:
print("Accuracy: "+str(np.mean(y_pred==y)))

Accuracy: 0.4575


In [124]:
# wholeData=pd.read_csv("openCloseData8.csv")
# wholeData=wholeData[wholeData['closed']==1]
# print(wholeData.shape)
# sample=wholeData.sample(n=400)
# print(sample.shape)
# sample.to_csv("demoData2.csv")