In [96]:
import matplotlib as plt
import numpy as np
import pandas as pd
import math
import nltk
import re
import string

### Features for classifiers

#### Query Content:  
Count of lowercase chars in body  
Count of uppercase chars in body  
lowercase to uppercase ratio in body  
character count in body  
word count in body  
code length in body  
length of first line in body   
punctuations count  
sentence count in body  
count of sentences starting with you  
count of sentences starting with i  
count of interrogative words  
count of URLs in body  
smiley count in body  
count of short words in body  
count of punctuations

character count in title  
word count in title  
title starts with what/which/how (useful for detecting opinion-based questions); alternatively, reuse the interrogative-word check


#### User profile
Reputation  
Creation Date  
View count  
upvotes received  
downvotes received  
user Badges  
about me filled  
website url filled  
location filled  
profile image url filled  
last access date  
questions asked by user  
answers posted by user  
Previous questions with negative score  
Previous questions with positive score  
Previous questions with zero score  
Previous answers with negative score  
Previous answers with positive score  
Previous answers with zero score  

In [98]:
#Resources for text cleaning: 
#https://machinelearningmastery.com/clean-text-machine-learning-python/
#https://github.com/SudalaiRajkumar/NLP/blob/master/src/Text%20Cleaning.ipynb
#https://docs.python.org/3/library/re.html

#Counts the lowercase characters contained in the text
def lowercaseCount(text):
    """Return how many elements reached by a two-level walk of `text` report islower().

    For a plain string every character is visited once (iterating a
    one-character string yields that same character again). For a list of
    strings every character of every string is visited.
    """
    return sum(1 for piece in text for symbol in piece if symbol.islower())

#Counts the uppercase characters contained in the text
def uppercaseCount(text):
    """Return how many elements reached by a two-level walk of `text` report isupper().

    For a plain string every character is visited once (iterating a
    one-character string yields that same character again). For a list of
    strings every character of every string is visited.
    """
    return sum(1 for piece in text for symbol in piece if symbol.isupper())
    
#Sentence tokenization
from nltk import sent_tokenize
#Splits a text into a list of sentence strings
def getSentTokens(sentences):
    """Return the result of NLTK's sent_tokenize applied to `sentences`."""
    sentenceList = sent_tokenize(sentences)
    return sentenceList

#Word tokenization
from nltk import word_tokenize
#Word-tokenizes every sentence of a 1D sentence list, producing a 2D list of words
def getWordTokens(sentenceList):
    """Return one word_tokenize result per entry of `sentenceList`, in the same order."""
    wordLists = []
    for sentence in sentenceList:
        wordLists.append(word_tokenize(sentence))
    return wordLists

#Punctuation removal
def removePunctuations(word):
    """Return `word` with every character that is not a letter or digit removed.

    The underscore is listed explicitly in the pattern because the regex
    non-word class alone treats "_" as a word character and would leave it in
    place, even though it is a punctuation character (see string.punctuation).
    Whitespace is removed as well, since it is not a word character either.

    A token made only of punctuation (e.g. "@", "'" or "__") becomes the empty
    string, which callers can then filter out.
    """
    return re.sub(r'[\W_]+', '', word)

#Stop words removal from list of words
from nltk.corpus import stopwords
#English stop words taken from the NLTK stopwords corpus, held in a set for the
#membership test performed by removeStopwords.
#NOTE(review): presumably requires the NLTK 'stopwords' corpus to be downloaded beforehand - confirm.
eng_stop=set(stopwords.words('english'))
#print(eng_stop)
def removeStopwords(text):
    """Return the entries of `text` that are not in eng_stop, keeping their order.

    The comparison is an exact membership test against eng_stop, so the
    entries are matched as given (no case folding is applied here).
    """
    kept = []
    for token in text:
        if token in eng_stop:
            continue
        kept.append(token)
    return kept

def _countSentencesStartingWith(tokens, word):
    #Counts the tokenized sentences in `tokens` whose first token equals `word`, ignoring case
    target = word.lower()
    count = 0
    for sentence in tokens:
        #An empty sentence has no first token; indexing it would raise IndexError
        if not sentence:
            continue
        first = sentence[0]
        #Case-insensitive so "You"/"I" match even when the text was not lowercased first
        if isinstance(first, str) and first.lower() == target:
            count += 1
    return count

def sentencesStartWithYouCount(tokens):
    """Return the number of sentences whose first token is "you" (any letter case).

    `tokens` is a 2D list: one list of word tokens per sentence. Empty
    sentences are skipped instead of raising IndexError.
    """
    return _countSentencesStartingWith(tokens, "you")

def sentencesStartWithICount(tokens):
    """Return the number of sentences whose first token is "i" (any letter case).

    `tokens` is a 2D list: one list of word tokens per sentence. Empty
    sentences are skipped instead of raising IndexError.
    """
    return _countSentencesStartingWith(tokens, "i")

In [100]:
#Sample text used to try out the character-count features
text = ("This's is@ 9 short, sentence. Amazing! I have tried an approach. I am trying this. I'm trying. "
       "You should suggest some, too. You've done this before?")
#Count each case once and reuse the totals for the ratio
lowerTotal = lowercaseCount(text)
upperTotal = uppercaseCount(text)
print("Lowercase count:", lowerTotal, sep="")
print("Uppercase count:", upperTotal, sep="")
print("Lowercase chars to Upper chars ratio:", lowerTotal / upperTotal, sep="")
#The following cells work on the lowercased text
text = text.lower()
print(text)

Lowercase count:103
Uppercase count:7
Lowercase chars to Upper chars ratio:14.714285714285714
this's is@ 9 short, sentence. amazing! i have tried an approach. i am trying this. i'm trying. you should suggest some, too. you've done this before?


In [85]:
#Split the text into sentences and report how many were found
sentences = getSentTokens(text)
print("Sentence count:", len(sentences), sep="")
print(sentences)

Sentence count:7
["this's is@ 9 short, sentence.", 'amazing!', 'i have tried an approach.', 'i am trying this.', "i'm trying.", 'you should suggest some, too.', "you've done this before?"]


In [86]:
#Word-tokenize each sentence with getWordTokens
tokens=getWordTokens(sentences)
#tokens is a 2D list: one inner list of word tokens per sentence
#Print each sentence's token list on its own line
for sentence in tokens:
    print(sentence)

['this', "'s", 'is', '@', '9', 'short', ',', 'sentence', '.']
['amazing', '!']
['i', 'have', 'tried', 'an', 'approach', '.']
['i', 'am', 'trying', 'this', '.']
['i', "'m", 'trying', '.']
['you', 'should', 'suggest', 'some', ',', 'too', '.']
['you', "'ve", 'done', 'this', 'before', '?']


In [87]:
#Report how many tokenized sentences open with "you" and with "i"
youStarts = sentencesStartWithYouCount(tokens)
print("Number of sentences starting with you:", youStarts, sep="")
iStarts = sentencesStartWithICount(tokens)
print("Number of sentences starting with i:", iStarts, sep="")

Number of sentences starting with you:2
Number of sentences starting with i:3


In [88]:
#Strip punctuation from every word of every sentence (flattening the 2D list)
#and keep only the words that are still non-empty afterwards.
#NOTE: this rebinds `tokens` from a 2D list to a flat list of words, so running
#this cell again without re-running the word-tokenization cell works on different input.
stripped = (removePunctuations(w) for s in tokens for w in s)
tokens = [word for word in stripped if word]
print(tokens, len(tokens), sep="\n")

['this', 's', 'is', '9', 'short', 'sentence', 'amazing', 'i', 'have', 'tried', 'an', 'approach', 'i', 'am', 'trying', 'this', 'i', 'm', 'trying', 'you', 'should', 'suggest', 'some', 'too', 'you', 've', 'done', 'this', 'before']
29


In [89]:
#Drop the English stop words from the flat word list, then show what is left and how many remain
tokens = removeStopwords(tokens)
print(tokens, len(tokens), sep="\n")

['9', 'short', 'sentence', 'amazing', 'tried', 'approach', 'trying', 'trying', 'suggest', 'done']
10


In [97]:
#Show the characters that the string module lists as punctuation
print(string.punctuation)

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~
