In [1]:
# all of the imports
import pandas as pd
import numpy as np
import pickle 
import patsy
import matplotlib.pyplot as plt
import seaborn as sns


from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
%matplotlib inline
from sklearn import preprocessing as pp
import warnings
warnings.filterwarnings('ignore')


In [2]:
from bs4 import BeautifulSoup


import os
import re
import html as ihtml

In [3]:
import nltk
import itertools
from nltk.probability import FreqDist
nltk.download('stopwords')
stopset = set(nltk.corpus.stopwords.words('english'))

[nltk_data] Downloading package stopwords to C:\Users\SUMIT
[nltk_data]     PAL\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Importing StackOverflow Data

In [5]:
questions = pd.read_csv('Questions.csv',encoding='latin1')

In [6]:
questions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 607282 entries, 0 to 607281
Data columns (total 6 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   Id            607282 non-null  int64  
 1   OwnerUserId   601070 non-null  float64
 2   CreationDate  607282 non-null  object 
 3   Score         607282 non-null  int64  
 4   Title         607282 non-null  object 
 5   Body          607282 non-null  object 
dtypes: float64(1), int64(2), object(3)
memory usage: 27.8+ MB


In [7]:
questions.head()

Unnamed: 0,Id,OwnerUserId,CreationDate,Score,Title,Body
0,469,147.0,2008-08-02T15:11:16Z,21,How can I find the full path to a font from it...,<p>I am using the Photoshop's javascript API t...
1,502,147.0,2008-08-02T17:01:58Z,27,Get a preview JPEG of a PDF on Windows?,<p>I have a cross-platform (Python) applicatio...
2,535,154.0,2008-08-02T18:43:54Z,40,Continuous Integration System for a Python Cod...,<p>I'm starting work on a hobby project with a...
3,594,116.0,2008-08-03T01:15:08Z,25,cx_Oracle: How do I iterate over a result set?,<p>There are several ways to iterate over a re...
4,683,199.0,2008-08-03T13:19:16Z,28,Using 'in' to match an attribute of Python obj...,<p>I don't remember whether I was dreaming or ...


# Data Processing

In [8]:
questions['Body'][0]

"<p>I am using the Photoshop's javascript API to find the fonts in a given PSD.</p>\n\n<p>Given a font name returned by the API, I want to find the actual physical font file that that font name corresponds to on the disc.</p>\n\n<p>This is all happening in a python program running on OSX so I guess I'm looking for one of:</p>\n\n<ul>\n<li>Some Photoshop javascript</li>\n<li>A Python function</li>\n<li>An OSX API that I can call from python</li>\n</ul>\n"

### 1 Removing html tags from questions

In [9]:
def clean_text(text):
    text = BeautifulSoup(ihtml.unescape(text), "lxml").text
    text = re.sub(r"http[s]?://\S+", "", text)
    text = re.sub(r"\s+", " ", text)
    return text

In [10]:
clean_text(questions['Body'][0])

"I am using the Photoshop's javascript API to find the fonts in a given PSD. Given a font name returned by the API, I want to find the actual physical font file that that font name corresponds to on the disc. This is all happening in a python program running on OSX so I guess I'm looking for one of: Some Photoshop javascript A Python function An OSX API that I can call from python "

In [11]:
questions['Body'] = questions['Body'].apply(clean_text)

In [12]:
questions.head()

Unnamed: 0,Id,OwnerUserId,CreationDate,Score,Title,Body
0,469,147.0,2008-08-02T15:11:16Z,21,How can I find the full path to a font from it...,I am using the Photoshop's javascript API to f...
1,502,147.0,2008-08-02T17:01:58Z,27,Get a preview JPEG of a PDF on Windows?,I have a cross-platform (Python) application w...
2,535,154.0,2008-08-02T18:43:54Z,40,Continuous Integration System for a Python Cod...,I'm starting work on a hobby project with a py...
3,594,116.0,2008-08-03T01:15:08Z,25,cx_Oracle: How do I iterate over a result set?,There are several ways to iterate over a resul...
4,683,199.0,2008-08-03T13:19:16Z,28,Using 'in' to match an attribute of Python obj...,I don't remember whether I was dreaming or not...


In [13]:
tags = pd.read_csv('Tags.csv')

In [14]:
tags.head()

Unnamed: 0,Id,Tag
0,469,python
1,469,osx
2,469,fonts
3,469,photoshop
4,502,python


### 2 Converting all questions to lowercase

In [15]:
questions['Body Lower'] = questions['Body'].apply(lambda text: text.lower())

In [16]:
questions['Body Lower'][0]

"i am using the photoshop's javascript api to find the fonts in a given psd. given a font name returned by the api, i want to find the actual physical font file that that font name corresponds to on the disc. this is all happening in a python program running on osx so i guess i'm looking for one of: some photoshop javascript a python function an osx api that i can call from python "

### 3 Removing puctuation in questions

In [17]:
import string

In [18]:
print(string.punctuation)

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


In [19]:
def remove_punctuation(text):
    cleaned_text = text.translate(str.maketrans('','',string.punctuation));
    return cleaned_text

In [20]:
questions['Cleaned Body'] = questions['Body Lower'].apply(remove_punctuation);

In [21]:
questions.head()

Unnamed: 0,Id,OwnerUserId,CreationDate,Score,Title,Body,Body Lower,Cleaned Body
0,469,147.0,2008-08-02T15:11:16Z,21,How can I find the full path to a font from it...,I am using the Photoshop's javascript API to f...,i am using the photoshop's javascript api to f...,i am using the photoshops javascript api to fi...
1,502,147.0,2008-08-02T17:01:58Z,27,Get a preview JPEG of a PDF on Windows?,I have a cross-platform (Python) application w...,i have a cross-platform (python) application w...,i have a crossplatform python application whic...
2,535,154.0,2008-08-02T18:43:54Z,40,Continuous Integration System for a Python Cod...,I'm starting work on a hobby project with a py...,i'm starting work on a hobby project with a py...,im starting work on a hobby project with a pyt...
3,594,116.0,2008-08-03T01:15:08Z,25,cx_Oracle: How do I iterate over a result set?,There are several ways to iterate over a resul...,there are several ways to iterate over a resul...,there are several ways to iterate over a resul...
4,683,199.0,2008-08-03T13:19:16Z,28,Using 'in' to match an attribute of Python obj...,I don't remember whether I was dreaming or not...,i don't remember whether i was dreaming or not...,i dont remember whether i was dreaming or not ...


### 4 Tokenized questions

In [23]:
clean_questions = questions[['Id','Cleaned Body']]

In [24]:
clean_questions['Tokenized'] = clean_questions['Cleaned Body'].apply(lambda text: text.split())

In [25]:
clean_questions.head()

Unnamed: 0,Id,Cleaned Body,Tokenized
0,469,i am using the photoshops javascript api to fi...,"[i, am, using, the, photoshops, javascript, ap..."
1,502,i have a crossplatform python application whic...,"[i, have, a, crossplatform, python, applicatio..."
2,535,im starting work on a hobby project with a pyt...,"[im, starting, work, on, a, hobby, project, wi..."
3,594,there are several ways to iterate over a resul...,"[there, are, several, ways, to, iterate, over,..."
4,683,i dont remember whether i was dreaming or not ...,"[i, dont, remember, whether, i, was, dreaming,..."


### 5 Removing stop words

In [22]:
stop_words = ["i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you", "your", "yours", "yourself",
              "yourselves", "he", "him", "his", "himself", "she", "her", "hers", "herself", "it", "its", "itself",
              "they", "them", "their", "theirs", "themselves", "what", "which", "who", "whom", "this", "that", "these",
              "those", "am", "is", "are", "was", "were", "be", "been", "being", "have", "has", "had", "having", "do",
              "does", "did", "doing", "a", "an", "the", "and", "but", "if", "or", "because", "as", "until", "while",
              "of", "at", "by", "for", "with", "about", "against", "between", "into", "through", "during", "before",
              "after", "above", "below", "to", "from", "up", "down", "in", "out", "on", "off", "over", "under", "again",
              "further", "then", "once", "here", "there", "when", "where", "why", "how", "all", "any", "both", "each",
              "few", "more", "most", "other", "some", "such", "no", "nor", "not", "only", "own", "same", "so", "than",
              "too", "very", "s", "t", "can", "will", "just", "don", "should", "now"]

In [26]:
def remove_stopped(tokenized):
    final_words =[]
    for word in tokenized:
        if word not in stop_words:
            final_words.append(word)
    return final_words

In [27]:
clean_questions['remove_stop'] = clean_questions['Tokenized'].apply(remove_stopped)

In [29]:
clean_questions.head()

Unnamed: 0,Id,Cleaned Body,Tokenized,remove_stop
0,469,i am using the photoshops javascript api to fi...,"[i, am, using, the, photoshops, javascript, ap...","[using, photoshops, javascript, api, find, fon..."
1,502,i have a crossplatform python application whic...,"[i, have, a, crossplatform, python, applicatio...","[crossplatform, python, application, needs, ge..."
2,535,im starting work on a hobby project with a pyt...,"[im, starting, work, on, a, hobby, project, wi...","[im, starting, work, hobby, project, python, c..."
3,594,there are several ways to iterate over a resul...,"[there, are, several, ways, to, iterate, over,...","[several, ways, iterate, result, set, tradeoff]"
4,683,i dont remember whether i was dreaming or not ...,"[i, dont, remember, whether, i, was, dreaming,...","[dont, remember, whether, dreaming, seem, reca..."


In [38]:
emotions_dict = {}
with open('emotions.txt','r') as file:
    for line in file:
        clear_line = line.replace('\n','').replace(',','').replace("'",'')
        word, emotion = clear_line.split(':')
        print("Word : {0} Emotion : {1}".format(word,emotion))
        emotions_dict[word] = emotion

Word :  victimized Emotion :  cheated
Word :  accused Emotion :  cheated
Word :  acquitted Emotion :  singled out
Word :  adorable Emotion :  loved
Word :  adored Emotion :  loved
Word :  affected Emotion :  attracted
Word :  afflicted Emotion :  sad
Word :  aghast Emotion :  fearful
Word :  agog Emotion :  attracted
Word :  agonized Emotion :  sad
Word :  alarmed Emotion :  fearful
Word :  amused Emotion :  happy
Word :  angry Emotion :  angry
Word :  anguished Emotion :  sad
Word :  animated Emotion :  happy
Word :  annoyed Emotion :  angry
Word :  anxious Emotion :  attracted
Word :  apathetic Emotion :  bored
Word :  appalled Emotion :  angry
Word :  appeased Emotion :  singled out
Word :  appreciated Emotion :  esteemed
Word :  apprehensive Emotion :  fearful
Word :  approved of Emotion :  loved
Word :  ardent Emotion :  lustful
Word :  aroused Emotion :  lustful
Word :  attached Emotion :  attached
Word :  attracted Emotion :  attracted
Word :  autonomous Emotion :  independent
W

In [46]:
def create_emotions(text):
    emotions_list = []
    for word in emotions_dict:
        if word in text:
            emotions_list.append(emotions_dict[word])
    return emotions_list

In [47]:
clean_questions['emotions_question'] = clean_questions['remove_stop'].apply(create_emotions)

In [48]:
clean_questions.to_csv('stack_emotions.csv')

In [54]:
from collections import Counter

In [55]:
clean_questions['Count emotion'] = clean_questions['emotions_question'].apply(lambda x:Counter(x))

# Sentiment Analysis

In [60]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [65]:
 nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to C:\Users\SUMIT
[nltk_data]     PAL\AppData\Roaming\nltk_data...


True

In [61]:
clean_questions.head()

Unnamed: 0,Id,Cleaned Body,Tokenized,remove_stop,emotions_question,Count emotion
0,469,i am using the photoshops javascript api to fi...,"[i, am, using, the, photoshops, javascript, ap...","[using, photoshops, javascript, api, find, fon...",[],{}
1,502,i have a crossplatform python application whic...,"[i, have, a, crossplatform, python, applicatio...","[crossplatform, python, application, needs, ge...",[],{}
2,535,im starting work on a hobby project with a pyt...,"[im, starting, work, on, a, hobby, project, wi...","[im, starting, work, hobby, project, python, c...",[],{}
3,594,there are several ways to iterate over a resul...,"[there, are, several, ways, to, iterate, over,...","[several, ways, iterate, result, set, tradeoff]",[],{}
4,683,i dont remember whether i was dreaming or not ...,"[i, dont, remember, whether, i, was, dreaming,...","[dont, remember, whether, dreaming, seem, reca...",[],{}


In [85]:
def sentiment_analyse(sentiment_text):
    score = SentimentIntensityAnalyzer().polarity_scores(sentiment_text)
    return score

In [None]:
clean_questions_stack = clean_questions[['Id','Cleaned Body']]

In [86]:
clean_questions_stack['score'] = clean_questions_stack['Cleaned Body'].apply(sentiment_analyse)

In [93]:
clean_questions_stack.head()

Unnamed: 0,Id,Cleaned Body,score
0,469,i am using the photoshops javascript api to fi...,"{'neg': 0.0, 'neu': 0.981, 'pos': 0.019, 'comp..."
1,502,i have a crossplatform python application whic...,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound..."
2,535,im starting work on a hobby project with a pyt...,"{'neg': 0.072, 'neu': 0.863, 'pos': 0.065, 'co..."
3,594,there are several ways to iterate over a resul...,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound..."
4,683,i dont remember whether i was dreaming or not ...,"{'neg': 0.0, 'neu': 0.928, 'pos': 0.072, 'comp..."


In [90]:
clean_questions_stack.drop(['negative','neutral','positive','compound'],axis=1,inplace=True)

In [91]:
clean_questions_stack.head()

Unnamed: 0,Id,Cleaned Body,score
0,469,i am using the photoshops javascript api to fi...,"{'neg': 0.0, 'neu': 0.981, 'pos': 0.019, 'comp..."
1,502,i have a crossplatform python application whic...,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound..."
2,535,im starting work on a hobby project with a pyt...,"{'neg': 0.072, 'neu': 0.863, 'pos': 0.065, 'co..."
3,594,there are several ways to iterate over a resul...,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound..."
4,683,i dont remember whether i was dreaming or not ...,"{'neg': 0.0, 'neu': 0.928, 'pos': 0.072, 'comp..."


In [92]:
clean_questions_stack.to_csv('sentiment_stack.csv')

In [94]:
clean_questions_stack[['Cleaned Body','score']].head(30)

Unnamed: 0,Cleaned Body,score
0,i am using the photoshops javascript api to fi...,"{'neg': 0.0, 'neu': 0.981, 'pos': 0.019, 'comp..."
1,i have a crossplatform python application whic...,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound..."
2,im starting work on a hobby project with a pyt...,"{'neg': 0.072, 'neu': 0.863, 'pos': 0.065, 'co..."
3,there are several ways to iterate over a resul...,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound..."
4,i dont remember whether i was dreaming or not ...,"{'neg': 0.0, 'neu': 0.928, 'pos': 0.072, 'comp..."
5,django view points to a function which can be ...,"{'neg': 0.036, 'neu': 0.916, 'pos': 0.049, 'co..."
6,i can get python to work with postgresql but i...,"{'neg': 0.099, 'neu': 0.764, 'pos': 0.137, 'co..."
7,i havent been able to find an understandable e...,"{'neg': 0.035, 'neu': 0.885, 'pos': 0.081, 'co..."
8,ive read that it is possible to add a method t...,"{'neg': 0.077, 'neu': 0.872, 'pos': 0.051, 'co..."
9,how do you express an integer as a binary numb...,"{'neg': 0.045, 'neu': 0.868, 'pos': 0.087, 'co..."
