In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

### Read data files

In [2]:
traindf = pd.read_csv('data/train.csv')
testdf = pd.read_csv('data/test.csv')

### Check original data

In [3]:
traindf.shape, testdf.shape

((31962, 3), (17197, 2))

#### Drop Duplicates

In [4]:
traindf.drop_duplicates(inplace = True)

In [5]:
testdf.drop_duplicates(inplace = True)

In [6]:
traindf.shape, testdf.shape

((31962, 3), (17197, 2))

In [7]:
traindf.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 31962 entries, 0 to 31961
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      31962 non-null  int64 
 1   label   31962 non-null  int64 
 2   tweet   31962 non-null  object
dtypes: int64(2), object(1)
memory usage: 998.8+ KB


In [8]:
traindf['label'].value_counts()

0    29720
1     2242
Name: label, dtype: int64

In [9]:
traindf.head(3)

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty


In [10]:
testdf.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 17197 entries, 0 to 17196
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      17197 non-null  int64 
 1   tweet   17197 non-null  object
dtypes: int64(1), object(1)
memory usage: 403.1+ KB


In [11]:
testdf.head(3)

Unnamed: 0,id,tweet
0,31963,#studiolife #aislife #requires #passion #dedic...
1,31964,@user #white #supremacists want everyone to s...
2,31965,safe ways to heal your #acne!! #altwaystohe...


### Cleaning tweets

In [12]:
import re
import nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from string import punctuation 
from nltk.stem.porter import PorterStemmer

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/umbertoleone/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/umbertoleone/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


### Clean train data set

In [13]:
#create empty list
corpus = []
# loop thru our train dataset 
for i in range (0, len(traindf)):
    tweet = traindf['tweet'][i]
    tweet = tweet.lower()
    tweet = re.sub('[^a-zA-Z]', ' ', tweet) #only alphabet
    tweet = re.sub('((www\.[^\s]+)|(https?://[^\s]+))', 'URL', tweet) # remove URLs
    tweet = re.sub('@[^\s]+', 'AT_USER',  tweet) # remove usernames
    tweet = re.sub(r'#([^\s]+)', r'\1', tweet) # remove the # in #hashtag
    tweet = tweet.split()
    ps = PorterStemmer()
    all_stopwords = stopwords.words('english')+ list(punctuation) + ['AT_USER','URL', 'user']
    tweet = [ps.stem(word) for word in tweet if not word in set(all_stopwords)]
    tweet = ' '.join(tweet)
    corpus.append(tweet)


### Cleaning test data set 

In [14]:
#create empty list
corpus_test = []
# loop thru our train dataset 
for i in range (0, len(testdf)):
    tweet = testdf['tweet'][i]
    tweet = tweet.lower()
    tweet = re.sub('[^a-zA-Z]', ' ', tweet) #only alphabet
    tweet = re.sub('((www\.[^\s]+)|(https?://[^\s]+))', 'URL', tweet) # remove URLs
    tweet = re.sub('@[^\s]+', 'AT_USER',  tweet) # remove usernames
    tweet = re.sub(r'#([^\s]+)', r'\1', tweet) # remove the # in #hashtag
    tweet = tweet.split()
    ps = PorterStemmer()
    all_stopwords = stopwords.words('english')+ list(punctuation) + ['AT_USER','URL', 'user']
    tweet = [ps.stem(word) for word in tweet if not word in set(all_stopwords)]
    tweet = ' '.join(tweet)
    corpus_test.append(tweet)

In [15]:
len(corpus), len(corpus_test)

(31962, 17197)

In [16]:
traindf.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 31962 entries, 0 to 31961
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      31962 non-null  int64 
 1   label   31962 non-null  int64 
 2   tweet   31962 non-null  object
dtypes: int64(2), object(1)
memory usage: 2.2+ MB


In [17]:
traindf['cleaned'] = np.array(corpus)

In [18]:
train = traindf.drop(columns=['id', 'tweet'])

In [19]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 31962 entries, 0 to 31961
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   label    31962 non-null  int64 
 1   cleaned  31962 non-null  object
dtypes: int64(1), object(1)
memory usage: 2.0+ MB


In [20]:
testdf.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 17197 entries, 0 to 17196
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      17197 non-null  int64 
 1   tweet   17197 non-null  object
dtypes: int64(1), object(1)
memory usage: 1.0+ MB


In [21]:
testdf['cleaned'] = np.array(corpus_test)

In [22]:
test = testdf.drop(columns=['id', 'tweet'])

In [23]:
test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 17197 entries, 0 to 17196
Data columns (total 1 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   cleaned  17197 non-null  object
dtypes: object(1)
memory usage: 908.7+ KB


In [24]:
train.columns, test.columns

(Index(['label', 'cleaned'], dtype='object'),
 Index(['cleaned'], dtype='object'))

In [25]:
type(train)

pandas.core.frame.DataFrame

In [26]:
non_hate = train[train['label'] == 0]
non_hate.shape

(29720, 2)

In [27]:
hate = train[train['label'] == 1]
hate.shape

(2242, 2)

## Balancing Train dataset Up-sampling Minority Class

https://elitedatascience.com/imbalanced-classes

In [28]:
from sklearn.utils import resample

In [29]:
#rename dfs with majority(non_hate) and nimority(hate)
train_majority = train[train['label'] == 0]
train_minority = train[train['label'] ==1]

In [30]:
#Upsample minority
train_minority_upsampled = resample (train_minority, replace=True, #sample with replacement
                                     n_samples=len(train_majority),# to match majority class
                                     random_state=42) # reproducible results 

In [31]:
#Concatanate train_minority_upsampled to train_majority
train_upsampled = pd.concat([train_minority_upsampled, train_majority])
#Display new class counts
train_upsampled['label'].value_counts()

1    29720
0    29720
Name: label, dtype: int64

In [32]:
train_upsampled.shape, train_upsampled.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 59440 entries, 12213 to 31961
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   label    59440 non-null  int64 
 1   cleaned  59440 non-null  object
dtypes: int64(1), object(1)
memory usage: 1.4+ MB


((59440, 2), None)

# Extracting Features

In [33]:
train_upsampled.columns, test.columns, train_upsampled.shape, test.shape

(Index(['label', 'cleaned'], dtype='object'),
 Index(['cleaned'], dtype='object'),
 (59440, 2),
 (17197, 1))

### 1. Bag of Words BOW

In [34]:
#train and test sets
from sklearn.feature_extraction.text import CountVectorizer

bow_cv = CountVectorizer(max_features=1000)
train_upsampled_bow = bow_cv.fit_transform(train_upsampled['cleaned']).toarray()
test_bow = bow_cv.fit_transform(test['cleaned']).toarray()

In [35]:
train_upsampled_bow.shape, test_bow.shape

((59440, 1000), (17197, 1000))

# Build the models

### Cross Validating Train Set

In [36]:
from sklearn.model_selection import train_test_split
X_train_bow, X_val_bow, y_train, y_val = train_test_split(train_upsampled_bow, train_upsampled['label'], test_size = 0.2, random_state = 42)

##### BOW Features

In [37]:
X_train_bow.shape, X_val_bow.shape, y_train.shape, y_val.shape

((47552, 1000), (11888, 1000), (47552,), (11888,))

### Random Forest 

In [38]:
from sklearn.ensemble import RandomForestClassifier

### 1. BOW Features

In [40]:
rf_bow = RandomForestClassifier(n_estimators=400, random_state=11).fit(X_train_bow, y_train)
prediction_bow = rf_bow.predict(X_val_bow)

In [41]:
from sklearn.metrics import f1_score, confusion_matrix, classification_report
print(f"F1 score : {f1_score(y_val, prediction_bow)}")
print(f"Training Data Score: {rf_bow.score(X_train_bow, y_train)}")
print(f"Validation Data Score: {rf_bow.score(X_val_bow, y_val)}")
print(classification_report(y_val, prediction_bow))

F1 score : 0.9703102784979509
Training Data Score: 0.9913988896366084
Validation Data Score: 0.9701379542395693
              precision    recall  f1-score   support

           0       0.99      0.95      0.97      6008
           1       0.95      0.99      0.97      5880

    accuracy                           0.97     11888
   macro avg       0.97      0.97      0.97     11888
weighted avg       0.97      0.97      0.97     11888



In [53]:
!pip install joblib



In [55]:
import joblib