## Author : Sayali Kudale

### This Notebook does the Data-Preprocessing activity

In [1]:
import warnings
warnings.filterwarnings("ignore") 

from bs4 import BeautifulSoup
import html as ihtml
import re
import nltk
from nltk.corpus import stopwords
from string import punctuation
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import pandas as pd
from string import punctuation
from nltk import word_tokenize, pos_tag

In [2]:
# Load the question / accepted-answer table; the file is decoded as ISO-8859-1
df_questions = pd.read_csv(
    "Data/QuestionsFrequent.csv",
    encoding="ISO-8859-1",
)

In [3]:
# number of rows loaded
df_questions.shape[0]

83156

### Data before Preprocessing

In [4]:
# first five rows of the raw data
df_questions.head(n=5)

Unnamed: 0,Id,Title,Body,Tags,CreationDate,Score,OwnerUserId,AcceptedAnswerId,AnswerOwnerUserId,AnswerCreationDate,AnswerScore
0,59549471,Why a named function declaration isn't hoisted...,<p>is putting the function declaration beside ...,javascript,2020-01-01 01:27:00,1,9451225.0,59549481,9515207.0,2020-01-01 01:29:28,4
1,59549415,Alarm Manager not woking,<p>I am using AlarmManager on clicking floatin...,java android,2020-01-01 01:07:11,1,8552968.0,59549519,8012516.0,2020-01-01 01:42:03,3
2,59551132,how to stop service when click home button?,<p>I have a <code>SoundService</code> for play...,android android-studio,2020-01-01 08:54:25,1,970404.0,59551203,12612238.0,2020-01-01 09:07:11,3
3,59551057,Copy attributes from case class,"<pre><code>case class A(a:Int ,b:Int,c:Int,d:I...",scala,2020-01-01 08:38:05,1,2845414.0,59551349,4993128.0,2020-01-01 09:34:35,3
4,59550793,combine multiple dataframes in a csv file sepa...,<p>how can I separate each dataframe with an e...,python pandas,2020-01-01 07:43:02,1,12225277.0,59550833,9840637.0,2020-01-01 07:50:45,4


### Check null and empty data 

In [5]:
# missing values per column
df_questions.isna().sum()

Id                       0
Title                    0
Body                     0
Tags                     0
CreationDate             0
Score                    0
OwnerUserId           1333
AcceptedAnswerId         0
AnswerOwnerUserId      180
AnswerCreationDate       0
AnswerScore              0
dtype: int64

In [6]:
# Drop the rows (not columns) whose question-owner id or answer-owner id is missing
df_questions = df_questions.dropna(subset=['OwnerUserId', 'AnswerOwnerUserId'])

In [7]:
# missing values per column after dropping the null-owner rows
df_questions.isna().sum()

Id                    0
Title                 0
Body                  0
Tags                  0
CreationDate          0
Score                 0
OwnerUserId           0
AcceptedAnswerId      0
AnswerOwnerUserId     0
AnswerCreationDate    0
AnswerScore           0
dtype: int64

### Total number of Questions after removing null data

In [8]:
# number of rows remaining
df_questions.shape[0]

81678

### Data PreProcessing function

1. remove_punctuation : This function removes all punctuation characters from the data, using the `punctuation` constant from Python's `string` module.
2. special_tags : Special tags such as C++, C# and .net are rewritten as alphabetic tokens (cplusplus, csharp, dotnet) before the special characters are removed from the data
3. extendedStopWords : Extended list of stop words found via analysis. (This list also contains some unnecessary words that are already handled by stop-word removal and POS tagging; they are retained as an extra validation step.)
4. clean_text : This function performs below activities:
    1. space regularization 
    2. code block removal 
    3. html tags removal 
    4. hyperlinks removal 
    5. lowercasing the text
    6. contraction expansion (abbreviation removal), e.g. "can't" becomes "can not"
    7. remove extra spacing 
    8. Pos Tagging and retaining only noun forms 
    9. Word lemmatization
    

In [9]:
def remove_punctuation(s):
    """Return ``s`` with every character listed in ``string.punctuation`` removed.

    All other characters, including whitespace, are kept in their original order.
    """
    kept_chars = []
    for ch in s:
        if ch in punctuation:
            continue
        kept_chars.append(ch)
    return ''.join(kept_chars)

def special_tags(text):
    """Rewrite technology names that contain punctuation into alphabetic tokens.

    ``c++`` becomes ``cplusplus``, ``c#`` becomes ``csharp`` and ``.net``
    becomes ``dotnet``. Matching is case-sensitive and is not anchored to word
    boundaries, so e.g. ``asp.net`` becomes ``aspdotnet``.
    """
    rewrites = (
        (r"c\+\+", "cplusplus"),
        (r"c\#", "csharp"),
        (r"\.net", "dotnet"),
    )
    for pattern, token in rewrites:
        text = re.sub(pattern, token, text)
    return text

# Domain-specific filler words found via analysis of the corpus. Built once at
# definition time as a frozenset so each token lookup is O(1) instead of a
# linear scan of a list that was re-created on every call. Entries are
# case-sensitive and are kept exactly as originally listed (including the
# capitalised forms and the repeated words, which a set simply collapses).
_EXTENDED_STOP_WORDS = frozenset([
    'code','please','thank','thanks','appreciate','fine','help','according', 'accordingly','write' ,'program','use',
    'cause','anyone','basic','allow','valid','within','answer','proper','across', 'act', 'asked','ask','success',
    'actually','need','would','like','tried','to','in','how','the','of','with','and','from','I','is','for',
    'an','on','want','can','way','say','said','says','seem','seems','when','not','do','Why','it','use','used','as',
    'Is','that','How','or','by','following','however','try','tried','trying','get','getting','problem','help','can',
    'could','issue','wrong','fine','run','also','know','example','see','based','find','something','thing','found',
    'treated','necessarily','work','working','worked','make','implemented','implement','another','one','two','differ',
    'different','depend','question','look','looked','understand','attempt','create','solution','possible','multiple',
    'able', 'even', 'check', 'hope', 'exist', 'someone', 'lot','case','perform','happen','option','achiev','via','got',
    'without','etc','idea','per','given','set','give','show','shown','using','use','expect','line','keep','advanced',
    'advance','connect','input','output','assign','take','always','syntax','similar','content','define','defined',
    'new','execute','executed','specified','started','added','generated','generate','supported','everything','well',
    'sure','fail','failed','failing','simple','task','require','available','inform','copy','copied','inside','default',
    'detailed','previous','previously','exact','project','current','currently','though','although','confuse','approached',
    'approach','since','really','student','department','employee',
])


def extendedStopWords(text):
    """Remove the extended, domain-specific stop words from ``text``.

    Parameters
    ----------
    text : str or any
        Text to filter. It is converted with ``str()`` and split on
        whitespace, so non-string values are filtered by their string form.

    Returns
    -------
    str
        The remaining tokens, in their original order, joined by single spaces.
    """
    return ' '.join(w for w in str(text).split() if w not in _EXTENDED_STOP_WORDS)
        

# Ordered (pattern, replacement) rewrites for English contractions. Order
# matters: a specific pattern must come before any generic pattern that would
# also match it ("what's" and "'scuse" before "'s"; "can't" before "n't").
_CONTRACTION_RULES = (
    (r"what's", "what is "),
    (r"\'scuse", " excuse "),
    (r"\'s", " "),
    (r"\'ve", " have "),
    (r"can't", "can not "),
    (r"n't", " not "),
    (r"i'm", "i am "),
    (r"\'re", " are "),
    (r"\'d", " would "),
    (r"\'ll", " will "),
    (r"\'\n", " "),
    (r"\'\xa0", " "),
)


def clean_text(text):
    """Reduce an HTML question title/body to a string of lemmatized nouns.

    Steps, in order:
      1. collapse every whitespace run to a single space
      2. delete ``<code>...</code>`` spans
      3. unescape HTML entities and strip the remaining markup
      4. delete http/https URLs
      5. lowercase
      6. expand contractions (see ``_CONTRACTION_RULES``)
      7. rewrite c++ / c# / .net via ``special_tags``
      8. remove punctuation
      9. remove the words in the module-level ``stop_words`` set
     10. keep only tokens whose POS tag starts with ``N``
     11. remove the extended stop words
     12. lemmatize each token with the module-level ``lemma``

    Parameters
    ----------
    text : str
        Raw text, possibly containing HTML.

    Returns
    -------
    str
        Space-separated tokens that survived the pipeline.

    Notes
    -----
    Relies on ``stop_words`` and ``lemma`` being defined at module level
    before the first call.
    """
    # regularizing spacing; this also puts a multi-line code block on a single
    # line, which the '.'-based pattern below needs in order to span it
    text = re.sub(r"\s+", " ", text)
    # removing code blocks
    text = re.sub(r'<code>.+?</code>', '', text)
    # removing html
    text = BeautifulSoup(ihtml.unescape(text), "lxml").text
    # removing hyperlinks
    text = re.sub(r"http[s]?://\S+", "", text)

    # lowercase the text
    text = text.lower()

    # expand contractions, then tidy the spacing the replacements introduce
    for pattern, replacement in _CONTRACTION_RULES:
        text = re.sub(pattern, replacement, text)
    text = re.sub(r'\s+', ' ', text)
    text = text.strip(' ')

    # take care of the tags such as c++, c# before punctuation is stripped
    text = special_tags(text)

    # remove punctuation and the extra spaces it can leave behind
    text = remove_punctuation(text)
    text = re.sub(' +', ' ', text)

    # remove stop words
    text = ' '.join(w for w in text.split() if w not in stop_words)

    # keep only tokens whose POS tag starts with 'N'
    text = ' '.join(
        token for token, pos in pos_tag(word_tokenize(text)) if pos.startswith('N')
    )

    text = extendedStopWords(text)

    # lemmatization
    text = ' '.join(lemma.lemmatize(w) for w in word_tokenize(text))

    return text


# Shared resources read by clean_text at call time.

# NLTK's English stop-word list, held as a set for fast membership tests
stop_words = set(stopwords.words("english"))

# WordNet lemmatizer instance reused for every token
lemma = WordNetLemmatizer()

#### Clean the body column data

In [10]:
# clean_text is passed directly; no wrapper lambda is needed
df_questions['Body'] = df_questions['Body'].apply(clean_text)

#### Clean the Title  column data

In [11]:
# clean_text is passed directly; no wrapper lambda is needed
df_questions['Title'] = df_questions['Title'].apply(clean_text)

In [12]:
# The owner-id columns were read as float; cast both to int in one call
df_questions = df_questions.astype({"OwnerUserId": int, "AnswerOwnerUserId": int})

#### data processing on tag column

In [13]:
# Strip '<', turn '>' into a space, then rewrite c++ / c# / .net tags,
# all in a single pass over the column
df_questions['Tags'] = df_questions['Tags'].apply(
    lambda tag: special_tags(tag.replace('<', '').replace('>', ' '))
)



#### Combine Title and Body into a single column, QuestionText

In [14]:
# Title first, then Body, separated by one space
df_questions["QuestionText"] = df_questions["Title"].str.cat(df_questions["Body"], sep=" ")

#### Remove the unnecessary columns from the data

In [15]:
# Title and Body now live in QuestionText, so they are dropped with the rest
df_questions = df_questions.drop(columns=["Score", "OwnerUserId", "Body", "Title"])


#### Visualization of the clean data 

In [16]:
# first five rows of the processed data
df_questions.head(n=5)

Unnamed: 0,Id,Tags,CreationDate,AcceptedAnswerId,AnswerOwnerUserId,AnswerCreationDate,AnswerScore,QuestionText
0,59549471,javascript,2020-01-01 01:27:00,59549481,9515207,2020-01-01 01:29:28,4,function declaration statement function declar...
1,59549415,java android,2020-01-01 01:07:11,59549519,8012516,2020-01-01 01:42:03,3,alarm manager alarmmanager floatingactionbutto...
2,59551132,android android-studio,2020-01-01 08:54:25,59551203,12612238,2020-01-01 09:07:11,3,service click home button play sound load star...
3,59551057,scala,2020-01-01 08:38:05,59551349,4993128,2020-01-01 09:34:35,3,class class object b scala
4,59550793,python pandas,2020-01-01 07:43:02,59550833,9840637,2020-01-01 07:50:45,4,combine dataframes file row dataframe row ive ...


In [17]:
# full processed text of the third row (by position)
df_questions["QuestionText"].iat[2]

'service click home button play sound load start service splashactivity service button callback mainactivity service home button activity android androidstudio'

In [18]:
# Write the processed data to disk, leaving out the DataFrame index
df_questions.to_csv(
    "Data/SO_ProcessedData.csv",
    index=False,
)