# CLASSIFICATION OF OPEN/CLOSED QUESTIONS

In [1]:
import numpy as np
import pandas as pd
import math
import nltk
import re
import string
import textstat
from sklearn.metrics import classification_report
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

## READING TEST DATA

In [2]:
demoData=pd.read_csv("demoData.csv")

In [3]:
demoData.shape

(100, 18)

In [4]:
demoData.columns

Index(['Unnamed: 0', 'PostId', 'UserId', 'Comment', 'Body', 'Title', 'Tags',
       'Reputation', 'experienceInTime', 'WebsiteUrl', 'Location', 'AboutMe',
       'Views', 'Upvotes', 'Downvotes', 'ProfileImageUrl', 'tagCount',
       'closed'],
      dtype='object')

In [5]:
demoData.head()

Unnamed: 0.1,Unnamed: 0,PostId,UserId,Comment,Body,Title,Tags,Reputation,experienceInTime,WebsiteUrl,Location,AboutMe,Views,Upvotes,Downvotes,ProfileImageUrl,tagCount,closed
0,111955,35235079,487328,100,<p>Need to URL encode each character only if i...,Linq Expression for DataTable to URLEncoding o...,c#|string|linq|urlencode|unicode-string,1250,1928,http://pword.sourceforge.net/,Los Angeles,"<p>Development includes c#, asp, asp.net, mssq...",438,551,19,,5,0
1,24281,32353618,2710873,103,"<p>I have installed <a href=""http://www.aerosp...",How to setup and configure aerospike cluster o...,<aerospike>,2959,740,http://selenium4u.blogspot.in/,Pune India,,198,25,3,https://i.stack.imgur.com/RoOLU.jpg,1,1
2,114830,31613945,4190459,100,<p>I'm trying to <code>aggregate</code> a data...,aggregate means and keep N,r|aggregate,667,269,,,,56,90,6,https://www.gravatar.com/avatar/cdfcba0dd40486...,2,0
3,102141,35690239,2310562,100,<p>I'm using One Page Nav Plugin (<a href= htt...,One Page navigation animation,jquery|css|scroll|navigation,337,1042,http://adamwojda.me,Lubawka,,41,33,1,,4,0
4,39962,35958399,5282634,103,<p>I have one of my own custom projects that i...,Maven: How to use a dependency that doesn't ha...,<java><maven><dependencies>,7,195,,,,1,0,0,https://www.gravatar.com/avatar/903bd89d103e0f...,3,1


## FEATURE ENGINEERING

In [6]:
#Credits: https://stackoverflow.com/questions/4576077/python-split-text-on-sentences

alphabets= "([A-Za-z])"
prefixes = "(Mr|St|Mrs|Ms|Dr)[.]"
suffixes = "(Inc|Ltd|Jr|Sr|Co)"
starters = "(Mr|Mrs|Ms|Dr|He\s|She\s|It\s|They\s|Their\s|Our\s|We\s|But\s|However\s|That\s|This\s|Wherever)"
acronyms = "([A-Z][.][A-Z][.](?:[A-Z][.])?)"
websites = "[.](com|net|org|io|gov)"
digits = "([0-9])" 

def split_into_sentences(text):
    #text=body
    text=striphtml(text)
    text = " " + text + "  "
    text = text.replace("\n"," ")
    text = re.sub(prefixes,"\\1<prd>",text)
    text = re.sub(websites,"<prd>\\1",text)
    if "Ph.D" in text: text = text.replace("Ph.D.","Ph<prd>D<prd>")
    text = re.sub("\s" + alphabets + "[.] "," \\1<prd> ",text)
    text = re.sub(acronyms+" "+starters,"\\1<stop> \\2",text)
    text = re.sub(alphabets + "[.]" + alphabets + "[.]" + alphabets + "[.]","\\1<prd>\\2<prd>\\3<prd>",text)
    text = re.sub(alphabets + "[.]" + alphabets + "[.]","\\1<prd>\\2<prd>",text)
    text = re.sub(" "+suffixes+"[.] "+starters," \\1<stop> \\2",text)
    text = re.sub(" "+suffixes+"[.]"," \\1<prd>",text)
    text = re.sub(" " + alphabets + "[.]"," \\1<prd>",text)
    text = re.sub(digits + "[.]" + digits,"\\1<prd>\\2",text)
    if "”" in text: text = text.replace(".”","”.")
    if "\"" in text: text = text.replace(".\"","\".")
    if "!" in text: text = text.replace("!\"","\"!")
    if "?" in text: text = text.replace("?\"","\"?")
    text = text.replace(".",".<stop>")
    text = text.replace("?","?<stop>")
    text = text.replace("!","!<stop>")
    text = text.replace("<prd>",".")
    sentences = text.split("<stop>")
    sentences = sentences[:-1]
    sentences = [s.strip() for s in sentences]
    #print(text)
    return sentences

In [7]:
#METHODS FOR FEATURE CREATION

#Removes all occurence of <code>    
def removeCode(text):
    #make sure you don't strip HTML beforehand
    textWithoutCode=re.sub(r"<code>(.*?)</code>","",text,flags= re.DOTALL)
    #print(codes)
    return textWithoutCode

#Removes HTML from query data but doesn't remove the content bw opening and closing tag
#Attributes are removed eg:<a href="this is all removed">not removed</a>
#https://stackoverflow.com/questions/3398852/using-python-remove-html-tags-formatting-from-a-string
def striphtml(data):
    p = re.compile(r'<.*?>')
    return p.sub('', data)
  
    
#Returns number of lowercase chars in the text
def lowercaseCount(text):
    text=removeCode(text)
    text=striphtml(text)
    lowercount=0
    try:
        for w in text:
            for char in w:
                if(char.islower()):
                    lowercount+=1
        return lowercount
    except:
        #print(type(text))
        print(text)

#Returns number of uppercase chars in the text
def uppercaseCount(text):
    text=removeCode(text)
    text=striphtml(text)
    uppercount=0
    try:    
        for w in text:
            for char in w:
                if(char.isupper()):
                    uppercount+=1
        return uppercount
    except:
        print(text)
    
    
#Count of sentences in the post body
def getSentenceCount(text):
    text=removeCode(text)
    return len(split_into_sentences(text))

def getFirstLineLength(text):
    text=removeCode(text)
    sentences=split_into_sentences(text)
    #print(len(sentences))
    if(len(sentences)==0):
        #print(text)
        return 0
    firstLine=sentences[0]
    return len(firstLine)
 
#Returns the length of total code present in the post
def codeLength(text):
    #make sure you don't strip HTML beforehand
    codes=re.findall(r"<code>(.*?)</code>",text,flags= re.DOTALL)
    #print(codes)
    return len(''.join(codes))

#https://stackoverflow.com/questions/6883049/regex-to-extract-urls-from-href-attribute-in-html-with-python
#https://stackoverflow.com/questions/1374457/find-out-how-many-times-a-regex-matches-in-a-string-in-python
#Returns count of url in a given post
def urlCount(text):
    urls=re.findall(r'https://?|ftp://',text)
    #print(urls)
    return len(urls)


#Returns count of URls pointing to Stack Overflow 
def SOUrlCount(text):
    SOUrls=re.findall(r'https://stackoverflow.com',text)
    #print(SOUrls)
    return len(SOUrls)
       
    
def countOfInterrogativeSent(text):
    text=text.lower()
    #tokens=text.split() #1D list of words 
    interrogatives=re.findall(r'\b[a-z]*[^.!][?]',text)
    return len(interrogatives)
    
    

def sentencesStartWithYouCount(text):
    count=0
    text=text.lower()
    #text=re.sub('<.*>','',text)
    text=removeCode(text)
    text=striphtml(text)
    sentences=split_into_sentences(text)
    for sentence in sentences:
        tokens=re.split('[\s;\',.\-\%]',sentence)
        #print(tokens)
        if(tokens[0]=="you"):
            #print("token0="+tokens[0])
            count+=1
    return count;

def sentencesStartWithICount(text):
    count=0
    text=text.lower()
    text=removeCode(text)
    text=striphtml(text)
    sentences=split_into_sentences(text)
    for sentence in sentences:
        tokens=re.split('[\s;\',.\-\%]',sentence)
        #print(tokens)
        if(tokens[0]=="i"):
            #print("token0="+tokens[0])
            count+=1
        if(len(tokens)>=2): #But I don't have, So I switched 
            if(tokens[1]=="i"):
                count+=1
        if(len(tokens)>=3):
            if(tokens[2]=="i"): #As such I had amended...
                count+=1
    return count;


#https://stackoverflow.com/questions/12628958/remove-small-words-using-python
def countShortWords(text):
    text=removeCode(text)
    text=striphtml(text)
    shortwords = re.findall(r'\W*\b\w{1,3}\b',text)
    #print(shortwords)
    return len(shortwords)

   
#Returns count of total number of words in the query
#https://stackoverflow.com/questions/19410018/how-to-count-the-number-of-words-in-a-sentence-ignoring-numbers-punctuation-an
def countWords(text):
    text=removeCode(text)
    text=striphtml(text)
    return len(text.split())


#Returns count of punctuations in the post body
#https://stackoverflow.com/questions/265960/best-way-to-strip-punctuation-from-a-string-in-python
def countPunctuations(text):
    #remove all code in the post
    text=re.sub('<code>.*?</code>','',text)
    
    text=striphtml(text)  #will also remove all URLs as URL is mentioned in href=" " attribute
    puncs=re.findall(r'[^\w\s]',text)
    #print(puncs)
    return len(puncs)

def makeBinary(text):
    if(pd.isnull(text)):
        return 0
    return 1


In [8]:
#Adding necessary feature cJolumns
#https://stackoverflow.com/questions/40045632/adding-a-column-in-pandas-df-using-a-function

demoData['lowerUpperRatio']=(demoData['Body'].apply(lowercaseCount))/(demoData['Body'].apply(uppercaseCount))

demoData['sentenceCount']=demoData['Body'].apply(getSentenceCount)

demoData['firstLineLength']=demoData['Body'].apply(getFirstLineLength)

demoData['codeLength']=demoData['Body'].apply(codeLength)

demoData['urlCount']=demoData['Body'].apply(urlCount)

demoData['SOUrlCount']=demoData['Body'].apply(SOUrlCount)

demoData['titleLengthInChars']=demoData['Title'].apply(len)

demoData['countOfInterrogativeSent']=demoData['Body'].apply(countOfInterrogativeSent)

demoData['sentencesStartWithYouCount']=demoData['Body'].apply(sentencesStartWithYouCount)
demoData['sentencesStartWithICount']=demoData['Body'].apply(sentencesStartWithICount)

demoData['shortWordCount']=demoData['Body'].apply(countShortWords)
demoData['bodyWordCount']=demoData['Body'].apply(countWords)
demoData['punctuationCount']=demoData['Body'].apply(countPunctuations)

demoData['websiteUrlFilled']=demoData['WebsiteUrl'].apply(makeBinary)
demoData['locationFilled']=demoData['Location'].apply(makeBinary) 
demoData['aboutMeFilled']=demoData['AboutMe'].apply(makeBinary)
demoData['profileImageUrlFilled']=demoData['ProfileImageUrl'].apply(makeBinary)

In [9]:
tagWeightData=pd.read_csv("tagWeightData.csv")
#print(tagWeightData.head())

In [10]:
idx=0
def calculateTagWeight(tags):
    forClosed=re.findall('<(.*?)>',tags)
    forOpen=tags.split('|')
    #print(forClosed)
    #print(forOpen)
    if(len(forClosed) ):  #if we pass tags for open in close it won't work, but vice versa will work hence 
        # this particular order of if-else 
        tagList=forClosed
    elif(len(forOpen) ):
        tagList=forOpen
    #print(tagList)
    wt=0
#     for t in tagList:
#         #print(t)
#         try:
            
#         except:
#             print(t)
#         print(wt)
#     print("check")
#     if(idx%1000==0)print(idx)
#     idx+=1
    try:
        for t in tagList:
            wt=wt+tagWeightData[tagWeightData['TagName']==t][['tagWeight']].values[0]
    except:
        #print(tagList)
        pass
    return wt

In [11]:
demoData['tagWeightSum']=demoData['Tags'].apply(calculateTagWeight) #tagClosingWeightSum

In [12]:
def gunningFog(text):
    text=removeCode(text)
    text=striphtml(text)
    return textstat.gunning_fog(text)

def fleschReadingEase(text):
    text=removeCode(text)
    text=striphtml(text)
    return textstat.flesch_reading_ease(text)

def daleChallReadabilityScore(text):
    text=removeCode(text)
    text=striphtml(text)
    return textstat.dale_chall_readability_score(text)


def smogIndex(text):
    text=removeCode(text)
    text=striphtml(text)
    return textstat.smog_index(text)

def colemanLiauIndex(text):
    text=removeCode(text)
    text=striphtml(text)
    return textstat.coleman_liau_index(text)


In [13]:
#https://en.wikipedia.org/wiki/Gunning_fog_index
demoData['gunningFog']=demoData['Body'].apply(gunningFog)
#https://en.wikipedia.org/wiki/Flesch%E2%80%93Kincaid_readability_tests#Flesch_reading_ease
demoData['fleschReadingEase']=demoData['Body'].apply(fleschReadingEase)
#https://en.wikipedia.org/wiki/Dale%E2%80%93Chall_readability_formula
demoData['daleChallReadabilityScore']=demoData['Body'].apply(daleChallReadabilityScore)
#https://en.wikipedia.org/wiki/SMOG
demoData['smogIndex']=demoData['Body'].apply(smogIndex)
#https://en.wikipedia.org/wiki/Coleman%E2%80%93Liau_index
demoData['colemanLiauIndex']=demoData['Body'].apply(colemanLiauIndex)

In [14]:
demoData['userProfileFilled']=demoData['websiteUrlFilled']+demoData['locationFilled']+demoData['aboutMeFilled']+demoData['profileImageUrlFilled']
print(demoData.head())

   Unnamed: 0    PostId   UserId  Comment  \
0      111955  35235079   487328      100   
1       24281  32353618  2710873      103   
2      114830  31613945  4190459      100   
3      102141  35690239  2310562      100   
4       39962  35958399  5282634      103   

                                                Body  \
0  <p>Need to URL encode each character only if i...   
1  <p>I have installed <a href="http://www.aerosp...   
2  <p>I'm trying to <code>aggregate</code> a data...   
3  <p>I'm using One Page Nav Plugin (<a href= htt...   
4  <p>I have one of my own custom projects that i...   

                                               Title  \
0  Linq Expression for DataTable to URLEncoding o...   
1  How to setup and configure aerospike cluster o...   
2                         aggregate means and keep N   
3                      One Page navigation animation   
4  Maven: How to use a dependency that doesn't ha...   

                                      Tags  Reputation 

### FROM 18 Columns to 42 Columns

In [15]:
demoData.columns

Index(['Unnamed: 0', 'PostId', 'UserId', 'Comment', 'Body', 'Title', 'Tags',
       'Reputation', 'experienceInTime', 'WebsiteUrl', 'Location', 'AboutMe',
       'Views', 'Upvotes', 'Downvotes', 'ProfileImageUrl', 'tagCount',
       'closed', 'lowerUpperRatio', 'sentenceCount', 'firstLineLength',
       'codeLength', 'urlCount', 'SOUrlCount', 'titleLengthInChars',
       'countOfInterrogativeSent', 'sentencesStartWithYouCount',
       'sentencesStartWithICount', 'shortWordCount', 'bodyWordCount',
       'punctuationCount', 'websiteUrlFilled', 'locationFilled',
       'aboutMeFilled', 'profileImageUrlFilled', 'tagWeightSum', 'gunningFog',
       'fleschReadingEase', 'daleChallReadabilityScore', 'smogIndex',
       'colemanLiauIndex', 'userProfileFilled'],
      dtype='object')

In [16]:
demoData.shape

(100, 42)

In [17]:
X = demoData.loc[:,['Reputation',
       'experienceInTime', 'Views',
       'Upvotes', 'Downvotes', 'tagCount', 'closed',
       'lowerUpperRatio', 'sentenceCount', 'firstLineLength', 'codeLength',
       'urlCount', 'SOUrlCount', 'titleLengthInChars',
       'countOfInterrogativeSent', 'sentencesStartWithYouCount',
       'sentencesStartWithICount', 'shortWordCount', 'bodyWordCount',
       'punctuationCount', 'tagWeightSum', 'gunningFog',
       'fleschReadingEase', 'daleChallReadabilityScore', 'smogIndex',
       'colemanLiauIndex', 'userProfileFilled'] ]

In [18]:
y = X.closed
X.drop(['closed'],inplace=True,axis=1)
print("Shape of X: "+str(X.shape))
print("Shape of y: "+str(y.shape))

Shape of X: (100, 26)
Shape of y: (100,)


## FEATURE SCALING

In [19]:
#(x-u)/std
mean=[ 2.01807416e+03, 5.10066908e+02 , 3.01059583e+02 , 2.26216980e+02,
  2.18902723e+01,  2.90577889e+00,  2.99731747e+01,  5.96079902e+00,
  9.98289770e+01,  5.68528057e+02,  1.91942299e-01,  1.54941374e-02,
  5.12481187e+01,  1.19541739e+00,  8.63771594e-03,  2.02549439e+00,
  4.09886685e+01,  8.53872034e+01,  4.33703530e+01, -2.44966593e+02,
  1.89378949e+01,  5.42152991e+01,  7.71907479e+00,  5.76047464e+00,
  9.82247931e+00,  1.54041351e+00]

var=[1.17173655e+08,3.42615432e+05, 2.94724961e+06, 6.03714126e+05,
 1.11587341e+05, 1.50461647e+00, 6.64480756e+02, 1.22872198e+02,
 8.75204965e+03, 2.07884824e+06, 8.66455736e-01, 2.93294209e-02,
 4.03492388e+02, 4.61051576e+00, 1.12006717e-02, 2.81215161e+00,
 1.21652922e+03, 4.70156550e+03, 2.03530533e+04, 4.59393389e+04,
 1.23125524e+02, 1.37349155e+03, 3.37276468e+00, 3.21043352e+01,
 3.34948687e+01, 1.23490391e+00]

print(X.head())
X=(X-mean)/np.sqrt(var)
print(X.head())

   Reputation  experienceInTime  Views  Upvotes  Downvotes  tagCount  \
0        1250              1928    438      551         19         5   
1        2959               740    198       25          3         1   
2         667               269     56       90          6         2   
3         337              1042     41       33          1         4   
4           7               195      1        0          0         3   

   lowerUpperRatio  sentenceCount  firstLineLength  codeLength  ...  \
0        16.181818             10              215         612  ...   
1        19.400000              2              111           0  ...   
2        20.111111              1               76         132  ...   
3        19.928571              1              113         294  ...   
4        16.363636              3               52           0  ...   

   shortWordCount  bodyWordCount  punctuationCount           tagWeightSum  \
0              59            117                16    [-387.475

In [20]:
# print(mean)
# print(np.sqrt(var))

## IMPORTING PRETRAINED MODEL

In [21]:
from joblib import dump, load
logreg=load('logreg1.joblib') 

In [22]:
print(logreg)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)


## PREDICTION

In [23]:
y_pred=logreg.predict(X)
print("Actual Predicted")
for i in range(0,25):
    print(str(y[i])+"\t"+str(y_pred[i]))

Actual Predicted
0	0
1	1
0	0
0	0
1	1
1	1
0	0
0	0
0	0
0	0
1	1
1	1
0	0
0	0
1	1
1	1
1	1
1	1
1	1
1	1
0	0
1	1
0	1
0	0
1	1


## MODEL PERFORMANCE

In [24]:
print("Accuracy: "+str(np.mean(y_pred==y)))
print("Precision: "+str(precision_score(y, y_pred, average='macro')) )
print("Recall: "+str(recall_score(y, y_pred, average='macro')) )
print("F1 Score:"+str(f1_score(y, y_pred, average='macro')) )

Accuracy: 0.9
Precision: 0.900974025974026
Recall: 0.8977455716586151
F1 Score:0.898989898989899


In [25]:
# wholeData=pd.read_csv("openCloseData8.csv")
# wholeData.shape
# sample=wholeData.sample(n=100)
# sample.shape
# sample.to_csv("demoData.csv")