In [1]:
# Import necessary libraries in this cell
import pandas as pd #pandas is a library for data wrangling/handling
import numpy as np #same case for numpy

# Libraries for helping us with strings
import string
# Regular Expression Library
import re

# Seaborn / matplotlib for visualization 
import seaborn as sns
# This command tells python to use seaborn for its styling.
sns.set()


# Matplotlib is also a very useful, basic visualization/plotting library
import matplotlib.pyplot as plt
# Very important, this will make your charts appear in your notebook instead of in a new window.
%matplotlib inline


# Provides z-score helper function,
# z-score uses standard deviation to remove outliers
# (industry standard is if a data point is 3 std devs away from mean,
# it's considered to be an outlier)
import scipy.stats as stats


# Ignore this, this is just for displaying images.
from IPython.display import Image


# Importing sklearn library
import sklearn

# Import the trees from sklearn
from sklearn import tree

# Metrics help us score our model, using metrics to evaluate our model
from sklearn import metrics

# Import our Random Forest Classifier
from sklearn.ensemble import RandomForestClassifier
# Import our Random Forest Regressor
from sklearn.ensemble import RandomForestRegressor

# Import our text vectorizers
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# This is our Logit model
from sklearn.linear_model import LogisticRegression
# Importing our linear regression model
from sklearn.linear_model import LinearRegression

# Helper fuctions to evaluate our model from sklearn, including f1_score.
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.metrics import f1_score, roc_auc_score
# Some more helpful ML function
from sklearn.metrics import confusion_matrix, plot_confusion_matrix
from sklearn.metrics import classification_report

# Helper function to split our data for testing and training purposes
from sklearn.model_selection import train_test_split
# Helper function for hyper-parameter turning.
from sklearn.model_selection import GridSearchCV

# Import MultinomaialNB classifier
from sklearn.naive_bayes import MultinomialNB

# Import our Decision Tree
from sklearn.tree import DecisionTreeClassifier

from sklearn.svm import SVC


# Library for visualizing our tree
# If you get an error, 
# run 'conda install python-graphviz' in your terminal (without the quotes).
import graphviz 


# NLTK is our Natural-Language-Took-Kit
import nltk

from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.stem import SnowballStemmer
from nltk.corpus import wordnet

# You may need to download these from nltk
# nltk.download('stopwords')
# nltk.download('punkt')
# nltk.download('wordnet')
stopwords = stopwords.words('english')


In [2]:
#First dataframe
df_reddit_data = pd.read_csv('data/Reddit_Data.csv')
print("Number of rows and columns in dataframe: " + str(df_reddit_data.shape), '\n')
df_reddit_data.head()

Number of rows and columns in dataframe: (37249, 2) 



Unnamed: 0,clean_comment,category
0,family mormon have never tried explain them t...,1
1,buddhism has very much lot compatible with chr...,1
2,seriously don say thing first all they won get...,-1
3,what you have learned yours and only yours wha...,0
4,for your own benefit you may want read living ...,1


In [3]:
# Checking for nulls
print(df_reddit_data.isnull().sum(), '\n')
print(((df_reddit_data.isnull().sum() / len(df_reddit_data)) *100).round(2))

clean_comment    100
category           0
dtype: int64 

clean_comment    0.27
category         0.00
dtype: float64


In [4]:
# Dropping nulls
df_reddit_data = df_reddit_data.dropna()

In [5]:
# Checking for duplicates
print("Number of dupes are " + str(df_reddit_data.duplicated().sum()))

Number of dupes are 350


In [6]:
#Checking which line exactly were duplicated
condition = df_reddit_data.duplicated() == True
df_reddit_data.duplicated()[condition]

375      True
392      True
617      True
651      True
1222     True
         ... 
36915    True
37044    True
37125    True
37158    True
37234    True
Length: 350, dtype: bool

In [7]:
#Dropping dupes
df_reddit_data = df_reddit_data.drop_duplicates()
print(df_reddit_data.duplicated().sum())

0


In [8]:
# Sanity Checking
print('Number of nulls: ' + str(df_reddit_data.isnull().sum()))
print("\nNumber of dupes are " + str(df_reddit_data.duplicated().sum()))
# print(str(df_reddit_data.duplicated()[condition]))
print('\nNumber of rows after cleaning data: ', df_reddit_data.shape[0])

Number of nulls: clean_comment    0
category         0
dtype: int64

Number of dupes are 0

Number of rows after cleaning data:  36799


In [9]:
# Checking for the type of columns, to see if some are faulty
# including duplicated columns or faulty ones that don't have an name for example
df_reddit_data.columns

Index(['clean_comment', 'category'], dtype='object')

In [10]:
#Second dataframe
df_twitter_data = pd.read_csv('data/Twitter_Data.csv')
print("Number of rows and columns in dataframe: " + str(df_twitter_data.shape), '\n')
df_twitter_data.head()

Number of rows and columns in dataframe: (162980, 2) 



Unnamed: 0,clean_text,category
0,when modi promised “minimum government maximum...,-1.0
1,talk all the nonsense and continue all the dra...,0.0
2,what did just say vote for modi welcome bjp t...,1.0
3,asking his supporters prefix chowkidar their n...,1.0
4,answer who among these the most powerful world...,1.0


In [11]:
# Checking for nulls
print(df_twitter_data.isnull().sum(), '\n')
print(((df_twitter_data.isnull().sum() / len(df_twitter_data)) *100).round(4))

clean_text    4
category      7
dtype: int64 

clean_text    0.0025
category      0.0043
dtype: float64


In [12]:
df_twitter_data = df_twitter_data.dropna()

In [13]:
print("Number of dupes are " + str(df_twitter_data.duplicated().sum()))

Number of dupes are 0


In [14]:
# Checking for the type of columns, to see if some are faulty
# including duplicated columns or faulty ones that don't have an name for example
df_twitter_data.columns

Index(['clean_text', 'category'], dtype='object')

In [15]:
# Renaming 'clean_text' to 'clean_comment' like the reddit dataframe has
# to facilitate the merging of both dataframes
df_twitter_data = df_twitter_data.rename(columns = { "clean_text" : "clean_comment"})

In [16]:
# Sanity Checking
print('Number of nulls: ' + str(df_twitter_data.isnull().sum()))
print("\nNumber of dupes are " + str(df_twitter_data.duplicated().sum()))
# print(str(df_twitter_data.duplicated()[condition]))
print('\nNumber of rows after cleaning data: ', df_twitter_data.shape[0])

Number of nulls: clean_comment    0
category         0
dtype: int64

Number of dupes are 0

Number of rows after cleaning data:  162969


In [17]:
#Third dataframe
df_bad_words = pd.read_csv('data/bad-words.csv')
print(df_bad_words.shape, '\n')
df_bad_words.head()

(1616, 1) 



Unnamed: 0,jigaboo
0,mound of venus
1,asslover
2,s&m
3,queaf
4,whitetrash


In [18]:
# Checking for nulls
print(df_bad_words.isnull().sum(), '\n')
print(((df_bad_words.isnull().sum() / len(df_bad_words)) *100).round(2))

jigaboo    0
dtype: int64 

jigaboo    0.0
dtype: float64


In [19]:
# Checking for duplicates
print("Number of dupes are " + str(df_bad_words.duplicated().sum()))

Number of dupes are 0


In [20]:
# Renaming the column to something more clear and understandable
df_bad_words = df_bad_words.rename(columns = { "jigaboo" : "Profanity"})

In [21]:
# Sanity Checking
print('Number of nulls: ' + str(df_bad_words.isnull().sum()))
print("\nNumber of dupes are " + str(df_bad_words.duplicated().sum()))
# print(str(df_reddit_data.duplicated()[condition]))
print('\nNumber of rows after cleaning data: ', df_bad_words.shape[0])

Number of nulls: Profanity    0
dtype: int64

Number of dupes are 0

Number of rows after cleaning data:  1616


In [35]:
#Fourth dataframe
# change underscores to hypens !!important
df_more_bad_words = pd.read_csv('data/more_bad_words.csv')
print(df_more_bad_words.shape, '\n')
df_more_bad_words.head()

(100, 1) 



Unnamed: 0,Profanity
0,americunt
1,as_hell
2,ass
3,asshole
4,bastard


In [36]:
# Checking for nulls
print(df_more_bad_words.isnull().sum(), '\n')
print(((df_more_bad_words.isnull().sum() / len(df_more_bad_words)) *100).round(2))

Profanity    0
dtype: int64 

Profanity    0.0
dtype: float64


In [39]:
# Checking for duplicates
print("Number of dupes are " + str(df_more_bad_words.duplicated().sum()))

Number of dupes are 1


In [40]:
#Checking which line exactly were duplicated
condition = df_more_bad_words.duplicated() == True
df_more_bad_words.duplicated()[condition]

98    True
dtype: bool

In [41]:
#Dropping dupes
df_more_bad_words = df_more_bad_words.drop_duplicates()
print(df_more_bad_words.duplicated().sum())

0


In [43]:
# Sanity Checking
print('Number of nulls: ' + str(df_more_bad_words.isnull().sum()))
print("\nNumber of dupes are " + str(df_more_bad_words.duplicated().sum()))
# print(str(df_reddit_data.duplicated()[condition]))
print('\nNumber of rows after cleaning data: ', df_more_bad_words.shape[0])

Number of nulls: Profanity    0
dtype: int64

Number of dupes are 0

Number of rows after cleaning data:  99


In [22]:
# My attempt at merging the reddit and twitter datasets together
list_of_dataframes = [df_reddit_data, df_twitter_data]
df_posts = pd.concat(list_of_dataframes)

In [23]:
print(str(df_posts.shape))
df_posts.head()

(199768, 2)


Unnamed: 0,clean_comment,category
0,family mormon have never tried explain them t...,1.0
1,buddhism has very much lot compatible with chr...,1.0
2,seriously don say thing first all they won get...,-1.0
3,what you have learned yours and only yours wha...,0.0
4,for your own benefit you may want read living ...,1.0


In [24]:
# Just some more sanity checking
df_posts.category.unique()

array([ 1., -1.,  0.])

In [25]:
# The index values of each row were messed up after cleaning the data
# and merging/concatinating the data, so I used the following to make sure that 
# the index values in the new dataframe correspond to the correct tuple
df_posts.reset_index()

Unnamed: 0,index,clean_comment,category
0,0,family mormon have never tried explain them t...,1.0
1,1,buddhism has very much lot compatible with chr...,1.0
2,2,seriously don say thing first all they won get...,-1.0
3,3,what you have learned yours and only yours wha...,0.0
4,4,for your own benefit you may want read living ...,1.0
...,...,...,...
199763,162975,why these 456 crores paid neerav modi not reco...,-1.0
199764,162976,dear rss terrorist payal gawar what about modi...,-1.0
199765,162977,did you cover her interaction forum where she ...,0.0
199766,162978,there big project came into india modi dream p...,0.0


In [26]:
df_posts = df_posts.reset_index()
df_posts = df_posts.drop(columns = 'index')

In [27]:
# Checking once again
df_posts



# Up until this part, everything went well. After this, I tried to merge the profanity data sets together.
# Kukai suggested using a function that will reduce the phrase/term to the root value, so I'll lokk into that

Unnamed: 0,clean_comment,category
0,family mormon have never tried explain them t...,1.0
1,buddhism has very much lot compatible with chr...,1.0
2,seriously don say thing first all they won get...,-1.0
3,what you have learned yours and only yours wha...,0.0
4,for your own benefit you may want read living ...,1.0
...,...,...
199763,why these 456 crores paid neerav modi not reco...,-1.0
199764,dear rss terrorist payal gawar what about modi...,-1.0
199765,did you cover her interaction forum where she ...,0.0
199766,there big project came into india modi dream p...,0.0


In [44]:
# My attempt at merging the profanity datasets together
list_of_dataframes = [df_bad_words, df_more_bad_words]
df_profanity = pd.concat(list_of_dataframes)

In [53]:
print(str(df_profanity.shape))
df_profanity.head()

(1715, 1)


Unnamed: 0,Profanity
0,mound of venus
1,asslover
2,s&m
3,queaf
4,whitetrash


In [56]:
# Just some more sanity checking
df_profanity.Profanity.unique()

array(['mound of venus', 'asslover', 's&m', ..., 'worthless', 'douche',
       'islamophobia'], dtype=object)

In [57]:
# The index values of each row were messed up after cleaning the data
# and merging/concatinating the data, so I used the following to make sure that 
# the index values in the new dataframe correspond to the correct tuple
df_profanity.reset_index()

Unnamed: 0,index,Profanity
0,0,mound of venus
1,1,asslover
2,2,s&m
3,3,queaf
4,4,whitetrash
...,...,...
1710,94,retarded
1711,95,delusional
1712,96,worthless
1713,97,douche


In [58]:
df_profanity = df_profanity.reset_index()
df_profanity = df_profanity.drop(columns = 'index')

In [59]:
# Checking once again
df_profanity

Unnamed: 0,Profanity
0,mound of venus
1,asslover
2,s&m
3,queaf
4,whitetrash
...,...
1710,retarded
1711,delusional
1712,worthless
1713,douche


In [78]:
# list of profane words that shouldn't be allowed
profanity_list = df_profanity.Profanity

In [79]:
profanity_list

0       mound of venus
1             asslover
2                  s&m
3                queaf
4           whitetrash
             ...      
1710          retarded
1711        delusional
1712         worthless
1713            douche
1714      islamophobia
Name: Profanity, Length: 1715, dtype: object

In [70]:
df_profanity.columns

Index(['Profanity'], dtype='object')

In [71]:
# Function to filer words out using profanity from df_bad_words
def remove_profanity(profane_str):
    
    words = word_tokenize(profane_str)
    
    valid_words = []
    
    for word in words:
        
        if word not in profanity_list:
            
            valid_words.append(word)
            
    profane_str = ' '.join(valid_words)
    
    return profane_str

In [72]:
#Seems like good practice to me, creating a single function that will call all
# our necessary functions from one place, will be subject to change
def text_pipeline(input_str):
    input_str = remove_profanity(input_str)
    return input_str
    

In [99]:
# df[df['A'].str.contains("hello")]
print(str(df_profanity[df_profanity['Profanity'].str.contains("fuck")]))
# condition = profanity_list == "fuck"
# profanity_list[condition]

           Profanity
52     fingerfucker 
55           assfuck
58      mothafucked 
67           fuckers
95        cuntfucker
...              ...
1652          fucks 
1653       fucktard 
1656    goatfuckers.
1672    motherfucker
1673  motherfucking 

[117 rows x 1 columns]


In [None]:
#turn profanity_list into a set rather than a dataframe so that its faster (constant time vs O(n))

In [None]:
#look into root functions 

In [101]:
print(str(df_posts[df_posts['clean_comment_profane_free'].str.contains("fuck")]))

                                            clean_comment  category  \
27         demogorgon because fuck you and your shit god       -1.0   
51       tea partier expresses support for namo after ...      -1.0   
99      scheduled castes day before marriage parties a...      -1.0   
118      deserves you think deserve such mainstream me...      -1.0   
204      what the fuck giroud know only vietnam but damn       -1.0   
...                                                   ...       ...   
195239  this means modi finally focusing elections onl...      -1.0   
195554  agree opposition too weak stand against bjp pr...      -1.0   
197211  are all varanasi dumbfucks millionaires modi a...       0.0   
198505  under modi government women are being empowere...       1.0   
199105  who the fuck saying this modi right even said ...      -1.0   

                               clean_comment_profane_free  
27          demogorgon because fuck you and your shit god  
51      tea partier express

In [90]:
# testing cell
# df_posts['clean_comment_profane_free'][12802]
df_posts['clean_comment_profane_free'].apply(text_pipeline)
print(str(df_posts['clean_comment_profane_free'][12802]))

fuck


In [91]:
profane_str = df_posts['clean_comment'][12802]
profane_str

'fuck'

In [100]:
profane_str = df_posts['clean_comment'][12802]

print(profane_str)

words = word_tokenize(profane_str)

print('\nstr words: ', str(words))

valid_words = []
    
print("In for loop \n")
for word in words:
    print('\nif loop out: ', str(word))
    if word not in profanity_list:
        print('\nif loop in: ', str(word))
        valid_words.append(word)
            
profane_str = ' '.join(valid_words)
print(str(valid_words), '\n')
print('\n', profane_str)
    


fuck

str words:  ['fuck']
In for loop 


if loop out:  fuck

if loop in:  fuck
['fuck'] 


 fuck


In [83]:
df_posts['clean_comment_profane_free'] = df_posts['clean_comment']
df_posts['clean_comment_profane_free'] = df_posts['clean_comment'].apply(text_pipeline)

In [86]:
# So based off of some testing, the data set with the profanity (the second one on the google doc)
# seems to have some really outlandish terms (and some I've never even heard of), but does
# not have some standards ones like 'fuck', so I'm going to merge the other data set on the 
# the google doc so we can have some more standard insults included in the profanity list
# and not have to make more work for ourselves, as merging seems to be fairly simple

# r, c= np.where(df_posts.clean_comment == "fuck")

# print ((next(iter(df_posts.index[r]), 'no match'), next(iter(df_posts.columns[c]), 'no match')))
r= np.where(df_posts.clean_comment == "fuck")

print ((next(iter(df_posts.index[r]), 'no match')))

12802


In [87]:
# Cell is used for testing profanity, above cell finds the record with the profane terms
#Just for testing purposes
print("ORIGINAL TEXT:", df_posts['clean_comment'][12802])
print("\nCLEANED TEXT:", df_posts['clean_comment_profane_free'][12802])

ORIGINAL TEXT: fuck

CLEANED TEXT: fuck


In [193]:
df_posts.head()

Unnamed: 0,clean_comment,category,clean_comment_profane_free
0,family mormon have never tried explain them t...,1.0,
1,buddhism has very much lot compatible with chr...,1.0,
2,seriously don say thing first all they won get...,-1.0,
3,what you have learned yours and only yours wha...,0.0,
4,for your own benefit you may want read living ...,1.0,


In [177]:
df_reddit_data.head()

Unnamed: 0,clean_comment,category
0,family mormon have never tried explain them t...,1
1,buddhism has very much lot compatible with chr...,1
2,seriously don say thing first all they won get...,-1
3,what you have learned yours and only yours wha...,0
4,for your own benefit you may want read living ...,1


In [178]:
df_twitter_data.head()

Unnamed: 0,clean_comment,category
0,when modi promised “minimum government maximum...,-1.0
1,talk all the nonsense and continue all the dra...,0.0
2,what did just say vote for modi welcome bjp t...,1.0
3,asking his supporters prefix chowkidar their n...,1.0
4,answer who among these the most powerful world...,1.0
