In [1]:
# import required libraries
import pandas as pd

In [5]:
# Read the datasets
train_df = pd.read_csv("dataset/train.csv")
test_df = pd.read_csv("dataset/test.csv")

In [6]:
train_df.head()

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation


In [7]:
test_df.head()

Unnamed: 0,id,tweet
0,31963,#studiolife #aislife #requires #passion #dedic...
1,31964,@user #white #supremacists want everyone to s...
2,31965,safe ways to heal your #acne!! #altwaystohe...
3,31966,is the hp and the cursed child book up for res...
4,31967,"3rd #bihday to my amazing, hilarious #nephew..."


In [8]:
train_df.shape,test_df.shape

((31962, 3), (17197, 2))

In [9]:
train_df.columns,test_df.columns

(Index(['id', 'label', 'tweet'], dtype='object'),
 Index(['id', 'tweet'], dtype='object'))

In [10]:
# Date Cleaning
import re

def clean_text(df,text_field):
    df[text_field] = df[text_field].str.lower()
    df[text_field] = df[text_field].apply(lambda elem: re.sub(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?", "", elem))
    return df

train_clean_df = clean_text(train_df,"tweet")
test_clean_df = clean_text(test_df,"tweet")

In [19]:
# Handling imbalanced data
from sklearn.utils import resample
train_majority = train_clean_df[train_clean_df.label==0]
train_minority = train_clean_df[train_clean_df.label==1]

train_minority_upsampled = resample(train_minority, \
                                    replace=True, \
                                    n_samples=len(train_majority), \
                                    random_state=42)

train_upsampled = pd.concat([train_minority_upsampled,train_majority])

train_upsampled['label'].value_counts()

1    29720
0    29720
Name: label, dtype: int64

In [35]:
# creating a preprocessing pipeline with randomforest classifier
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.ensemble import RandomForestClassifier
pipeline_rfc = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf',  TfidfTransformer()),
    ('nb', RandomForestClassifier()),])

In [36]:
# split the dataset into tarin and test
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(train_upsampled['tweet'],
                                                    train_upsampled['label'],random_state = 42)

In [37]:
# fit the model
model = pipeline_rfc.fit(X_train, y_train)
y_predict = model.predict(X_test)

# calculate the f1 score
from sklearn.metrics import f1_score
f1_score(y_test, y_predict)

0.9972216575184658

In [38]:
# find out the confusion matrix
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test,y_predict)

array([[7461,   41],
       [   0, 7358]], dtype=int64)