In [5]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

### Read data files

In [6]:
traindf = pd.read_csv('data/train.csv')
testdf = pd.read_csv('data/test.csv')

### Check original data

In [7]:
traindf.shape, testdf.shape

((31962, 3), (17197, 2))

#### Drop Duplicates

In [8]:
traindf.drop_duplicates(inplace = True)

In [9]:
testdf.drop_duplicates(inplace = True)

In [10]:
traindf.shape, testdf.shape

((31962, 3), (17197, 2))

In [11]:
traindf.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 31962 entries, 0 to 31961
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      31962 non-null  int64 
 1   label   31962 non-null  int64 
 2   tweet   31962 non-null  object
dtypes: int64(2), object(1)
memory usage: 998.8+ KB


In [12]:
traindf['label'].value_counts()

0    29720
1     2242
Name: label, dtype: int64

In [13]:
traindf.head(3)

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty


In [14]:
testdf.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 17197 entries, 0 to 17196
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      17197 non-null  int64 
 1   tweet   17197 non-null  object
dtypes: int64(1), object(1)
memory usage: 403.1+ KB


In [15]:
testdf.head(3)

Unnamed: 0,id,tweet
0,31963,#studiolife #aislife #requires #passion #dedic...
1,31964,@user #white #supremacists want everyone to s...
2,31965,safe ways to heal your #acne!! #altwaystohe...


### Cleaning tweets

In [16]:
import re
import nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from string import punctuation 
from nltk.stem.porter import PorterStemmer

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/umbertoleone/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/umbertoleone/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


### Clean train data set

In [17]:
#create empty list
corpus = []
# loop thru our train dataset 
for i in range (0, len(traindf)):
    tweet = traindf['tweet'][i]
    tweet = tweet.lower()
    tweet = re.sub('[^a-zA-Z]', ' ', tweet) #only alphabet
    tweet = re.sub('((www\.[^\s]+)|(https?://[^\s]+))', 'URL', tweet) # remove URLs
    tweet = re.sub('@[^\s]+', 'AT_USER',  tweet) # remove usernames
    tweet = re.sub(r'#([^\s]+)', r'\1', tweet) # remove the # in #hashtag
    tweet = tweet.split()
    ps = PorterStemmer()
    all_stopwords = stopwords.words('english')+ list(punctuation) + ['AT_USER','URL', 'user']
    tweet = [ps.stem(word) for word in tweet if not word in set(all_stopwords)]
    tweet = ' '.join(tweet)
    corpus.append(tweet)


### Cleaning test data set 

In [18]:
#create empty list
corpus_test = []
# loop thru our train dataset 
for i in range (0, len(testdf)):
    tweet = testdf['tweet'][i]
    tweet = tweet.lower()
    tweet = re.sub('[^a-zA-Z]', ' ', tweet) #only alphabet
    tweet = re.sub('((www\.[^\s]+)|(https?://[^\s]+))', 'URL', tweet) # remove URLs
    tweet = re.sub('@[^\s]+', 'AT_USER',  tweet) # remove usernames
    tweet = re.sub(r'#([^\s]+)', r'\1', tweet) # remove the # in #hashtag
    tweet = tweet.split()
    ps = PorterStemmer()
    all_stopwords = stopwords.words('english')+ list(punctuation) + ['AT_USER','URL', 'user']
    tweet = [ps.stem(word) for word in tweet if not word in set(all_stopwords)]
    tweet = ' '.join(tweet)
    corpus_test.append(tweet)

In [19]:
len(corpus), len(corpus_test)

(31962, 17197)

In [20]:
traindf.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 31962 entries, 0 to 31961
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      31962 non-null  int64 
 1   label   31962 non-null  int64 
 2   tweet   31962 non-null  object
dtypes: int64(2), object(1)
memory usage: 2.2+ MB


In [21]:
traindf['cleaned'] = np.array(corpus)

In [22]:
train = traindf.drop(columns=['id', 'tweet'])

In [23]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 31962 entries, 0 to 31961
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   label    31962 non-null  int64 
 1   cleaned  31962 non-null  object
dtypes: int64(1), object(1)
memory usage: 2.0+ MB


In [24]:
testdf.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 17197 entries, 0 to 17196
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      17197 non-null  int64 
 1   tweet   17197 non-null  object
dtypes: int64(1), object(1)
memory usage: 1.0+ MB


In [25]:
testdf['cleaned'] = np.array(corpus_test)

In [26]:
test = testdf.drop(columns=['id', 'tweet'])

In [27]:
test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 17197 entries, 0 to 17196
Data columns (total 1 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   cleaned  17197 non-null  object
dtypes: object(1)
memory usage: 908.7+ KB


In [28]:
train.columns, test.columns

(Index(['label', 'cleaned'], dtype='object'),
 Index(['cleaned'], dtype='object'))

In [29]:
type(train)

pandas.core.frame.DataFrame

In [30]:
non_hate = train[train['label'] == 0]
non_hate.shape

(29720, 2)

In [31]:
hate = train[train['label'] == 1]
hate.shape

(2242, 2)

# Extracting Features

In [32]:
train.columns, test.columns, train.shape, test.shape

(Index(['label', 'cleaned'], dtype='object'),
 Index(['cleaned'], dtype='object'),
 (31962, 2),
 (17197, 1))

### 2. Tf-Idf

In [33]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=1000)
train_tfidf = tfidf_vectorizer.fit_transform(train['cleaned']).toarray()
test_tfidf = tfidf_vectorizer.fit_transform(test['cleaned']).toarray()

In [34]:
train_tfidf.shape, test_tfidf.shape

((31962, 1000), (17197, 1000))

# Build the models

1. Penalized SVM


In [35]:
from sklearn.metrics import f1_score, confusion_matrix, classification_report

### Cross Validating Train Set

In [40]:
from sklearn.model_selection import train_test_split
X_train_tfidf, X_val_tfidf, y_train, y_val = train_test_split(train_tfidf, train['label'], test_size = 0.2, random_state = 42)

### SVM Support Vector Machine

In [42]:
from sklearn import svm

### Tf-Idf Features

In [43]:
svc = svm.SVC(kernel='linear', class_weight='balanced', probability=True).fit(X_train_tfidf, y_train)

prediction_smv = svc.predict_proba(X_val_tfidf)
prediction_int = prediction_smv[:,1] >= 0.3
prediction_int = prediction_int.astype(np.int)
f1_score(y_val, prediction_int)

0.5232163080407701

In [45]:
print(f"F1 score : {f1_score(y_val, prediction_int)}")
print(f"Training Data Score: {svc.score(X_train_tfidf, y_train)}")
print(f"Validation Data Score: {svc.score(X_val_tfidf, y_val)}")
print(classification_report(y_val, prediction_int))

F1 score : 0.5232163080407701
Training Data Score: 0.8782510070788846
Validation Data Score: 0.8560926012826529
              precision    recall  f1-score   support

           0       0.96      0.97      0.96      5937
           1       0.54      0.51      0.52       456

    accuracy                           0.93      6393
   macro avg       0.75      0.74      0.74      6393
weighted avg       0.93      0.93      0.93      6393



In [46]:
import joblib

In [47]:
filename = 'svm_tfidf_penalized.pkl'
joblib.dump(svc, filename)

['svm_tfidf_penalized.pkl']