## Importing the necessary Libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import confusion_matrix,f1_score
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import re

In [2]:
train = pd.read_csv("train.csv")
train.head()

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation


In [3]:
train.shape

(31962, 3)

In [4]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31962 entries, 0 to 31961
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      31962 non-null  int64 
 1   label   31962 non-null  int64 
 2   tweet   31962 non-null  object
dtypes: int64(2), object(1)
memory usage: 749.2+ KB


In [5]:
train['label'].value_counts()

0    29720
1     2242
Name: label, dtype: int64

In [6]:
train.isnull().sum()

id       0
label    0
tweet    0
dtype: int64

## Processing the tweets to get Clean texts

In [7]:
def process_tweet(tweet):
    return " ".join(re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])", " ",tweet.lower()).split())

In [8]:
train['cleaned_tweets'] = train['tweet'].apply(process_tweet)
train.head()

Unnamed: 0,id,label,tweet,cleaned_tweets
0,1,0,@user when a father is dysfunctional and is s...,when a father is dysfunctional and is so selfi...
1,2,0,@user @user thanks for #lyft credit i can't us...,thanks for lyft credit i can t use cause they ...
2,3,0,bihday your majesty,bihday your majesty
3,4,0,#model i love u take with u all the time in ...,model i love u take with u all the time in ur
4,5,0,factsguide: society now #motivation,factsguide society now motivation


## Spliting the  data into training and testing sets

In [9]:
from sklearn.model_selection import train_test_split
X = train['cleaned_tweets']
y = train['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify = None)

In [10]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
count_vect = CountVectorizer(stop_words='english')
transformer = TfidfTransformer(norm='l2',sublinear_tf=True)

In [11]:
x_train_counts = count_vect.fit_transform(X_train)
x_train_tfidf = transformer.fit_transform(x_train_counts)

In [12]:
x_train_counts.shape, x_train_tfidf.shape

((25569, 33731), (25569, 33731))

In [13]:
x_test_counts = count_vect.transform(X_test)
x_test_tfidf = transformer.transform(x_test_counts)

### Logistic Regression

In [14]:
#logistic regression
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression(random_state=42)
logreg.fit(x_train_tfidf,y_train)
predict_log = logreg.predict(x_test_tfidf)

In [15]:
print(confusion_matrix(y_test,predict_log))
print(classification_report(y_test, predict_log))

[[5942   13]
 [ 304  134]]
              precision    recall  f1-score   support

           0       0.95      1.00      0.97      5955
           1       0.91      0.31      0.46       438

    accuracy                           0.95      6393
   macro avg       0.93      0.65      0.72      6393
weighted avg       0.95      0.95      0.94      6393



In [16]:
accuracy_score(y_test, predict_log)*100

95.04145158767402

### Random Forest

In [17]:
#random forest
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(n_estimators=500)
rfc.fit(x_train_tfidf,y_train)
predict_rfc = rfc.predict(x_test_tfidf)

In [18]:
print(confusion_matrix(y_test,predict_rfc))
print(classification_report(y_test, predict_rfc))

[[5924   31]
 [ 198  240]]
              precision    recall  f1-score   support

           0       0.97      0.99      0.98      5955
           1       0.89      0.55      0.68       438

    accuracy                           0.96      6393
   macro avg       0.93      0.77      0.83      6393
weighted avg       0.96      0.96      0.96      6393



In [19]:
accuracy_score(y_test, predict_rfc)*100

96.41795714062256

### Support Vector Machine

In [20]:
#SVM Model
from sklearn import svm
lin_clf = svm.LinearSVC()
lin_clf.fit(x_train_tfidf,y_train)
predict_svm = lin_clf.predict(x_test_tfidf)

In [21]:
print(confusion_matrix(y_test,predict_svm))
print(classification_report(y_test, predict_svm))

[[5909   46]
 [ 164  274]]
              precision    recall  f1-score   support

           0       0.97      0.99      0.98      5955
           1       0.86      0.63      0.72       438

    accuracy                           0.97      6393
   macro avg       0.91      0.81      0.85      6393
weighted avg       0.96      0.97      0.96      6393



In [22]:
accuracy_score(y_test, predict_svm)*100

96.715157203191

## Testing the given Test dataset

In [23]:
test = pd.read_csv("test.csv")
test.head()

Unnamed: 0,id,tweet
0,31963,#studiolife #aislife #requires #passion #dedic...
1,31964,@user #white #supremacists want everyone to s...
2,31965,safe ways to heal your #acne!! #altwaystohe...
3,31966,is the hp and the cursed child book up for res...
4,31967,"3rd #bihday to my amazing, hilarious #nephew..."


In [24]:
test.shape

(17197, 2)

In [25]:
test['cleaned_tweets'] = test['tweet'].apply(process_tweet)
test.head()

Unnamed: 0,id,tweet,cleaned_tweets
0,31963,#studiolife #aislife #requires #passion #dedic...,studiolife aislife requires passion dedication...
1,31964,@user #white #supremacists want everyone to s...,white supremacists want everyone to see the ne...
2,31965,safe ways to heal your #acne!! #altwaystohe...,safe ways to heal your acne altwaystoheal heal...
3,31966,is the hp and the cursed child book up for res...,is the hp and the cursed child book up for res...
4,31967,"3rd #bihday to my amazing, hilarious #nephew...",3rd bihday to my amazing hilarious nephew eli ...


In [26]:
X = test['cleaned_tweets']
x_test_counts = count_vect.transform(X)
x_test_tfidf = transformer.transform(x_test_counts)

In [27]:
test['predict_lr'] = logreg.predict(x_test_tfidf)
test['predict_rfc'] = rfc.predict(x_test_tfidf)
test['predict_svm'] = lin_clf.predict(x_test_tfidf)

In [28]:
test

Unnamed: 0,id,tweet,cleaned_tweets,predict_lr,predict_rfc,predict_svm
0,31963,#studiolife #aislife #requires #passion #dedic...,studiolife aislife requires passion dedication...,0,0,0
1,31964,@user #white #supremacists want everyone to s...,white supremacists want everyone to see the ne...,0,1,1
2,31965,safe ways to heal your #acne!! #altwaystohe...,safe ways to heal your acne altwaystoheal heal...,0,0,0
3,31966,is the hp and the cursed child book up for res...,is the hp and the cursed child book up for res...,0,0,0
4,31967,"3rd #bihday to my amazing, hilarious #nephew...",3rd bihday to my amazing hilarious nephew eli ...,0,0,0
...,...,...,...,...,...,...
17192,49155,thought factory: left-right polarisation! #tru...,thought factory left right polarisation trump ...,1,1,1
17193,49156,feeling like a mermaid ð #hairflip #neverre...,feeling like a mermaid hairflip neverready for...,0,0,0
17194,49157,#hillary #campaigned today in #ohio((omg)) &am...,hillary campaigned today in ohio omg amp used ...,0,0,0
17195,49158,"happy, at work conference: right mindset leads...",happy at work conference right mindset leads t...,0,0,0


## Storing the result in csv file

In [29]:
file_name = "test_report.csv"
test.to_csv(file_name,index=False)

In [30]:
report = pd.read_csv("test_report.csv")
report.head(50)

Unnamed: 0,id,tweet,cleaned_tweets,predict_lr,predict_rfc,predict_svm
0,31963,#studiolife #aislife #requires #passion #dedic...,studiolife aislife requires passion dedication...,0,0,0
1,31964,@user #white #supremacists want everyone to s...,white supremacists want everyone to see the ne...,0,1,1
2,31965,safe ways to heal your #acne!! #altwaystohe...,safe ways to heal your acne altwaystoheal heal...,0,0,0
3,31966,is the hp and the cursed child book up for res...,is the hp and the cursed child book up for res...,0,0,0
4,31967,"3rd #bihday to my amazing, hilarious #nephew...",3rd bihday to my amazing hilarious nephew eli ...,0,0,0
5,31968,choose to be :) #momtips,choose to be momtips,0,0,0
6,31969,something inside me dies ð¦ð¿â¨ eyes nes...,something inside me dies eyes ness smokeyeyes ...,0,0,0
7,31970,#finished#tattoo#inked#ink#loveitâ¤ï¸ #â¤ï¸...,finished tattoo inked ink loveit thanks aleeee,0,0,0
8,31971,@user @user @user i will never understand why...,i will never understand why my dad left me whe...,0,0,0
9,31972,#delicious #food #lovelife #capetown mannaep...,delicious food lovelife capetown mannaepicure ...,0,0,0


In [31]:
# Conclusion

# Algorithms                                  Accuracy

#Logistic Regression  --                         95%

#Random Forest        --                         96%

#SVM                  --                         97%

