In [42]:
# Import necessary libraries in this cell
import pandas as pd #pandas is a library for data wrangling/handling
import numpy as np #same case for numpy

# Libraries for helping us with strings
import string
# Regular Expression Library
import re

# Seaborn / matplotlib for visualization 
import seaborn as sns
# This command tells python to use seaborn for its styling.
sns.set()


# Matplotlib is also a very useful, basic visualization/plotting library
import matplotlib.pyplot as plt
# Very important, this will make your charts appear in your notebook instead of in a new window.
%matplotlib inline


# Provides z-score helper function,
# z-score uses standard deviation to remove outliers
# (industry standard is if a data point is 3 std devs away from mean,
# it's considered to be an outlier)
import scipy.stats as stats


# Ignore this, this is just for displaying images.
from IPython.display import Image


# Importing sklearn library
import sklearn

# Import the trees from sklearn
from sklearn import tree

# Metrics help us score our model, using metrics to evaluate our model
from sklearn import metrics

# Import our Random Forest Classifier
from sklearn.ensemble import RandomForestClassifier
# Import our Random Forest Regressor
from sklearn.ensemble import RandomForestRegressor

# Import our text vectorizers
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# This is our Logit model
from sklearn.linear_model import LogisticRegression
# Importing our linear regression model
from sklearn.linear_model import LinearRegression

# Helper fuctions to evaluate our model from sklearn, including f1_score.
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.metrics import f1_score, roc_auc_score
# Some more helpful ML function
from sklearn.metrics import confusion_matrix, plot_confusion_matrix
from sklearn.metrics import classification_report

# Helper function to split our data for testing and training purposes
from sklearn.model_selection import train_test_split
# Helper function for hyper-parameter turning.
from sklearn.model_selection import GridSearchCV

# Import MultinomaialNB classifier
from sklearn.naive_bayes import MultinomialNB

# Import our Decision Tree
from sklearn.tree import DecisionTreeClassifier

from sklearn.svm import SVC


# Library for visualizing our tree
# If you get an error, 
# run 'conda install python-graphviz' in your terminal (without the quotes).
import graphviz 


# NLTK is our Natural-Language-Took-Kit
import nltk

from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.stem import SnowballStemmer
from nltk.corpus import wordnet

# You may need to download these from nltk
# nltk.download('stopwords')
# nltk.download('punkt')
# nltk.download('wordnet')
stopwords = stopwords.words('english')


import pickle

In [43]:
#First dataframe
df_reddit_data = pd.read_csv('data1/Reddit_Data.csv')
print("Number of rows and columns in dataframe: " + str(df_reddit_data.shape), '\n')
print(df_reddit_data.head())

print("*"*20, "\nIs null:")
print(df_reddit_data.isnull().sum(), '\n')
print("% of null and dup in data")
print(((df_reddit_data.isnull().sum() / len(df_reddit_data)) *100).round(2))
# Dropping nulls
df_reddit_data.dropna(inplace=True)

# Checking for duplicates
print("Number of dupes are", df_reddit_data.duplicated().sum(), "\n")

#Dropping dupes
df_reddit_data.drop_duplicates(inplace=True)

#checking dupes
print("total number of dupes: ",df_reddit_data.duplicated().sum())

# total number of rows:
print('\nNumber of rows after cleaning data: ', df_reddit_data.shape[0])

print(df_reddit_data.category.unique())

Number of rows and columns in dataframe: (37249, 2) 

                                       clean_comment  category
0   family mormon have never tried explain them t...         1
1  buddhism has very much lot compatible with chr...         1
2  seriously don say thing first all they won get...        -1
3  what you have learned yours and only yours wha...         0
4  for your own benefit you may want read living ...         1
******************** 
Is null:
clean_comment    100
category           0
dtype: int64 

% of null and dup in data
clean_comment    0.27
category         0.00
dtype: float64
Number of dupes are 350 

total number of dupes:  0

Number of rows after cleaning data:  36799
[ 1 -1  0]


In [26]:
# Checking for the type of columns, to see if some are faulty
# including duplicated columns or faulty ones that don't have an name for example
df_reddit_data.columns

Index(['clean_comment', 'category'], dtype='object')

In [27]:
#Second dataframe
df_twitter_data = pd.read_csv('data1/Twitter_Data.csv')
print("Number of rows and columns in dataframe: " + str(df_twitter_data.shape), '\n')
print(df_twitter_data.head())

print("*"*20, "\nIs null:")
print(df_twitter_data.isnull().sum(), '\n')
print("% of null and dup in data")
print(((df_twitter_data.isnull().sum() / len(df_twitter_data)) *100).round(2))
# Dropping nulls
df_twitter_data.dropna(inplace=True)

# Checking for duplicates
print("Number of dupes are", df_twitter_data.duplicated().sum(), "\n")

#Dropping dupes
df_twitter_data.drop_duplicates(inplace=True)

#checking dupes
print("total number of dupes: ",df_twitter_data.duplicated().sum())

# total number of rows:
print('\nNumber of rows after cleaning data: ', df_twitter_data.shape[0])

print(df_twitter_data.category.unique())

Number of rows and columns in dataframe: (162980, 2) 

                                          clean_text  category
0  when modi promised “minimum government maximum...      -1.0
1  talk all the nonsense and continue all the dra...       0.0
2  what did just say vote for modi  welcome bjp t...       1.0
3  asking his supporters prefix chowkidar their n...       1.0
4  answer who among these the most powerful world...       1.0
******************** 
Is null:
clean_text    4
category      7
dtype: int64 

% of null and dup in data
clean_text    0.0
category      0.0
dtype: float64
Number of dupes are 0 

total number of dupes:  0

Number of rows after cleaning data:  162969
[-1.  0.  1.]


In [28]:
#Fourth dataframe
# change underscores to hypens !!important
df_more_bad_words = pd.read_csv('data1/more-bad-words.csv', names=["clean_comment"])
df_bad_words = pd.read_csv("data1/bad-words.csv", names=["clean_comment"])

df_profanity = pd.concat([df_more_bad_words, df_bad_words], ignore_index=True)
df_profanity["category"] = -1

print(df_profanity.head())

# Checking for nulls
print(df_profanity.isnull().sum(), '\n')
print(((df_profanity.isnull().sum() / len(df_profanity)) *100).round(2))
df_profanity.dropna(inplace=True)

# Checking for duplicates
print("Number of dupes are " + str(df_profanity.duplicated().sum()))

#Dropping dupes

df_profanity.drop_duplicates(inplace=True)
print(df_profanity.duplicated().sum())
# Sanity Checking
print('Number of nulls: ' + str(df_profanity.isnull().sum()))
print("\nNumber of dupes are " + str(df_profanity.duplicated().sum()))
# print(str(df_reddit_data.duplicated()[condition]))
print('\nNumber of rows after cleaning data: ', df_profanity.shape[0])
print(df_profanity.category.unique())

#Saving the profanity separately as well, just in case
filename = 'data1/profanity.pkl'
pickle.dump(df_profanity, open(filename, "wb"))

  clean_comment  category
0     americunt        -1
1      as_hell         -1
2          ass         -1
3       asshole        -1
4       bastard        -1
clean_comment    0
category         0
dtype: int64 

clean_comment    0.0
category         0.0
dtype: float64
Number of dupes are 41
0
Number of nulls: clean_comment    0
category         0
dtype: int64

Number of dupes are 0

Number of rows after cleaning data:  1676
[-1]


In [29]:
# I ran this cell to merge the deepfake dataset with the reddit and twitter datasets

# merging the deep fake datasets
df_test = pd.read_csv("data1/test.csv")
df_train = pd.read_csv("data1/train.csv")
df_valid = pd.read_csv("data1/validation.csv")

df_test = df_test.drop(columns = ['screen_name', 'class_type'])
df_train = df_train.drop(columns = ['screen_name', 'class_type'])
df_valid = df_valid.drop(columns = ['screen_name', 'class_type'])

# have to run this cell again once i change the values in the second column
df_list = [df_test, df_train, df_valid]
# pd.concat() has a parameter (index_ignore) that will rid us of the problem a useless index
df_deepfake = pd.concat(df_list, ignore_index = True)

df_deepfake = df_deepfake.rename(columns = {'text' : 'clean_comment', 'account.type' : 'category'})

print(df_deepfake["category"].unique())
condition1 = df_deepfake['category'] == 'human'
condition2 = df_deepfake['category'] == 'bot'

df_deepfake.loc[condition1, 'category'] = 1
df_deepfake.loc[condition2, 'category'] = -1
print(df_deepfake["category"].unique())


#saved this as a separate dataframe here using pickle, just in case
filename = 'data1/deepfake.pkl'

pickle.dump(df_deepfake, open(filename, "wb"))

df_dpfk = pickle.load(open(filename, 'rb'))

['human' 'bot']
[1 -1]


In [30]:
print(df_dpfk.category.unique())

[1 -1]


In [31]:
# My attempt at merging the reddit and twitter datasets together
df_twitter_data.columns =["clean_comment", "category"]

list_of_df = [df_reddit_data, df_twitter_data, df_profanity, df_dpfk]

df_posts = pd.concat(list_of_df, ignore_index=True)

In [32]:
print(sum(len(a) for a in list_of_df))
print(str(df_posts.shape))
df_posts.head()

227016
(227016, 2)


Unnamed: 0,clean_comment,category
0,family mormon have never tried explain them t...,1
1,buddhism has very much lot compatible with chr...,1
2,seriously don say thing first all they won get...,-1
3,what you have learned yours and only yours wha...,0
4,for your own benefit you may want read living ...,1


In [33]:
# Just some more sanity checking
print(df_posts.shape)
print(df_posts.columns)
print(df_posts.category.unique())

(227016, 2)
Index(['clean_comment', 'category'], dtype='object')
[1 -1 0]


In [34]:
print(str(df_profanity.shape))
print(len(df_profanity["clean_comment"].unique()))
df_profanity.head()

(1676, 2)
1676


Unnamed: 0,clean_comment,category
0,americunt,-1
1,as_hell,-1
2,ass,-1
3,asshole,-1
4,bastard,-1


In [35]:
# list of profane words that shouldn't be allowed
profanity_list = set(df_profanity["clean_comment"])

In [36]:
# Function to filer words out using profanity from df_bad_words
def remove_profanity(profane_str):
    words = word_tokenize(profane_str)
    valid_words = []
    for word in words:
        if word not in profanity_list:
            valid_words.append(word)
    profane_str = ' '.join(valid_words)
    return profane_str

In [37]:
#Seems like good practice to me, creating a single function that will call all
# our necessary functions from one place, will be subject to change
translator = str.maketrans('', '', string.punctuation)

def text_pipeline(input_str):
    input_str = remove_profanity(input_str)
    return input_str

def mk_lower(a):
    return a.lower()

def remove_stopwords(a):
    return " ".join([word for word in word_tokenize(a) if word not in stopwords])

def remove_sp_char(a):
    ## \s for white space, ^ is negation, \w is words.  so replace all punctutation that follows a word 
    return a.translate(translator)

def remove_sp_char2(a):
    return re.sub(r"[^\w\s]", "", a)
    
    
def text_pipeline2(a):
    a = mk_lower(a)
    a = remove_sp_char(a)
    a = remove_stopwords(a)
    return a

In [39]:
# Two columns, both filtered, but one has profanity while the other does not
df_posts["clean_comment"] = df_posts['clean_comment'].apply(text_pipeline2)
df_posts['clean_comment_profane_free'] = df_posts['clean_comment'].apply(text_pipeline)

In [40]:
# clean_comment is the original value of everything, unfiltered
# clean_comment2 is the clean_comment column but filtered through a pipeline of functions that filter the text
# clean_comment_profane_free is the clean_comment2 column applied with an additional (profanity) filter
df_posts.head()

Unnamed: 0,clean_comment,category,clean_comment_profane_free
0,family mormon never tried explain still stare ...,1,family never tried explain still stare puzzled...
1,buddhism much lot compatible christianity espe...,1,buddhism much lot compatible christianity espe...
2,seriously say thing first get complex explain ...,-1,seriously say thing first get complex explain ...
3,learned want teach different focus goal wrappi...,0,learned want teach different focus goal wrappi...
4,benefit may want read living buddha living chr...,1,benefit may want read living buddha living thi...


In [41]:
# NAME YOUR MODEL 
filename = 'data1/comments-df.pkl'

# EXPORT AND SAVE df
pickle.dump(df_posts, open(filename, "wb"))

## HOW TO LOAD IT FOR FUTURE USE
df = pickle.load(open(filename, 'rb'))


In [21]:
print(df)

                                            clean_comment category  \
0       family mormon never tried explain still stare ...        1   
1       buddhism much lot compatible christianity espe...        1   
2       seriously say thing first get complex explain ...       -1   
3       learned want teach different focus goal wrappi...        0   
4       benefit may want read living buddha living chr...        1   
...                                                   ...      ...   
227011   youre going even prouder dont clue trade believe       -1   
227012              httpstco10xkzxdbcf httpstcociuiyweb45        1   
227013  2 “ take place people live head favorite coffe...        1   
227014  black like company need someone ready also rea...       -1   
227015  guys hate facebook facebook ad campaign made h...       -1   

                               clean_comment_profane_free  
0       family never tried explain still stare puzzled...  
1       buddhism much lot compatible ch

In [44]:
# NAME YOUR MODEL 
filename = 'data1/comments-df.pkl'
## HOW TO LOAD IT FOR FUTURE USE
df = pickle.load(open(filename, 'rb'))


In [45]:
df.head()

Unnamed: 0,clean_comment,category,clean_comment_profane_free
0,family mormon never tried explain still stare ...,1,family never tried explain still stare puzzled...
1,buddhism much lot compatible christianity espe...,1,buddhism much lot compatible christianity espe...
2,seriously say thing first get complex explain ...,-1,seriously say thing first get complex explain ...
3,learned want teach different focus goal wrappi...,0,learned want teach different focus goal wrappi...
4,benefit may want read living buddha living chr...,1,benefit may want read living buddha living thi...


In [52]:
# Two columns, both filtered, but one has profanity while the other does not
df = df.drop(columns=['clean_comment_profane_free'])

In [53]:
df.head()

Unnamed: 0,clean_comment,category
0,family mormon never tried explain still stare ...,1
1,buddhism much lot compatible christianity espe...,1
2,seriously say thing first get complex explain ...,-1
3,learned want teach different focus goal wrappi...,0
4,benefit may want read living buddha living chr...,1


In [54]:
# NAME YOUR MODEL 
filename = 'data1/comments-df.pkl'

# EXPORT AND SAVE df
pickle.dump(df, open(filename, "wb"))

In [55]:
df.head()

Unnamed: 0,clean_comment,category
0,family mormon never tried explain still stare ...,1
1,buddhism much lot compatible christianity espe...,1
2,seriously say thing first get complex explain ...,-1
3,learned want teach different focus goal wrappi...,0
4,benefit may want read living buddha living chr...,1
