In [36]:
# Import Libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import Binarizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.metrics import roc_auc_score

In [4]:
# Import Dataset
df = pd.read_csv('Clean Tweets.csv')
df.head()

Unnamed: 0,full_text,source,final_sources,text_final,Vader Sentiment,Vader Analysis
0,@lysisbunny @wheeingloss Dm me if you need hel...,"<a href=""http://twitter.com/download/iphone"" r...",twitter for iphone,lysisbunni wheeingloss dm need help bill cash app,0.4019,Neutral
1,RT @nope_thank_u: OR countless other Māori org...,"<a href=""http://twitter.com/download/iphone"" r...",twitter for iphone,rt nope thank u countless ori organis fight or...,0.1999,Neutral
2,RT @voicesofvv: Driving around to our usual sp...,"<a href=""https://mobile.twitter.com"" rel=""nofo...",twitter web app,rt voicesofvv drive around usual spot weekli d...,0.3612,Neutral
3,RT @roshan_pie: I'm struggling financially rig...,"<a href=""http://twitter.com/download/iphone"" r...",twitter for iphone,rt roshan pie struggl financi right employ end...,0.0,Neutral
4,@babyygirlvenus Dm me if you need help with bi...,"<a href=""http://twitter.com/download/iphone"" r...",twitter for iphone,babyygirlvenu dm need help bill cash app,0.4019,Neutral


### I see that the first character of every "final source" feature is a space. I'd like to remove that

In [5]:
df['final_sources'] = df.final_sources.str[1:]
df.final_sources[0]

'twitter for iphone'

In [6]:
# Looking for NaN, there appear to be 2 NaN values in text_final because they were all stop-words

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2523 entries, 0 to 2522
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   full_text        2523 non-null   object 
 1   source           2523 non-null   object 
 2   final_sources    2523 non-null   object 
 3   text_final       2521 non-null   object 
 4   Vader Sentiment  2523 non-null   float64
 5   Vader Analysis   2523 non-null   object 
dtypes: float64(1), object(5)
memory usage: 69.1+ KB


In [7]:
df[df.text_final.isna()]

Unnamed: 0,full_text,source,final_sources,text_final,Vader Sentiment,Vader Analysis
574,My venmo and cashapp are both,"<a href=""http://jillresh.com"" rel=""nofollow"">j...",joey ebooks,,0.0,Neutral
1524,My venmo and cashapp are both,"<a href=""http://jillresh.com"" rel=""nofollow"">j...",joey ebooks,,0.0,Neutral


In [8]:
# filtering the DataFrame to be nonnull values and checking again

df = df[~df.text_final.isna()]
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2521 entries, 0 to 2522
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   full_text        2521 non-null   object 
 1   source           2521 non-null   object 
 2   final_sources    2521 non-null   object 
 3   text_final       2521 non-null   object 
 4   Vader Sentiment  2521 non-null   float64
 5   Vader Analysis   2521 non-null   object 
dtypes: float64(1), object(5)
memory usage: 88.6+ KB


In [9]:
# checking to see what the top 3 most frequent tweet sources are:

df.final_sources.value_counts().index[:3]


Index(['twitter for iphone', 'twitter for android', 'twitter web app'], dtype='object')

In [10]:
top_3 = df.final_sources.value_counts().index[:3]

df[df.final_sources.isin(top_3)].final_sources.value_counts()

twitter for iphone     1550
twitter for android     547
twitter web app         344
Name: final_sources, dtype: int64

In [11]:
# creating a new dataframe of just the tweets from the top 3 sources

df_top3 = df[df.final_sources.isin(top_3)]
df_top3.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2441 entries, 0 to 2522
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   full_text        2441 non-null   object 
 1   source           2441 non-null   object 
 2   final_sources    2441 non-null   object 
 3   text_final       2441 non-null   object 
 4   Vader Sentiment  2441 non-null   float64
 5   Vader Analysis   2441 non-null   object 
dtypes: float64(1), object(5)
memory usage: 85.8+ KB


In [12]:
# creating a category feature identifying the tweet source

sources_map = {
    'twitter for iphone': 0,
    'twitter for android': 1,
    'twitter web app': 2
}
df_top3['labels'] = df_top3.final_sources.map(sources_map)
df_top3['labels'] = df_top3.labels.astype('category')
df_top3.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_top3['labels'] = df_top3.final_sources.map(sources_map)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_top3['labels'] = df_top3.labels.astype('category')


Unnamed: 0,full_text,source,final_sources,text_final,Vader Sentiment,Vader Analysis,labels
0,@lysisbunny @wheeingloss Dm me if you need hel...,"<a href=""http://twitter.com/download/iphone"" r...",twitter for iphone,lysisbunni wheeingloss dm need help bill cash app,0.4019,Neutral,0
1,RT @nope_thank_u: OR countless other Māori org...,"<a href=""http://twitter.com/download/iphone"" r...",twitter for iphone,rt nope thank u countless ori organis fight or...,0.1999,Neutral,0
2,RT @voicesofvv: Driving around to our usual sp...,"<a href=""https://mobile.twitter.com"" rel=""nofo...",twitter web app,rt voicesofvv drive around usual spot weekli d...,0.3612,Neutral,2
3,RT @roshan_pie: I'm struggling financially rig...,"<a href=""http://twitter.com/download/iphone"" r...",twitter for iphone,rt roshan pie struggl financi right employ end...,0.0,Neutral,0
4,@babyygirlvenus Dm me if you need help with bi...,"<a href=""http://twitter.com/download/iphone"" r...",twitter for iphone,babyygirlvenu dm need help bill cash app,0.4019,Neutral,0


In [13]:
df_top3.groupby('labels').count()

Unnamed: 0_level_0,full_text,source,final_sources,text_final,Vader Sentiment,Vader Analysis
labels,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,1550,1550,1550,1550,1550,1550
1,547,547,547,547,547,547
2,344,344,344,344,344,344


In [18]:
# create the corpus for TFIDF

corpus = df_top3['text_final'].values.astype('U')
tfidf = TfidfVectorizer()
X = tfidf.fit_transform(corpus)
y = df_top3['labels']

In [29]:
# Split the data into testing & training data sets

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)

In [30]:
# Looking to make sure they are the same length

X_train.shape

(1952, 3305)

In [31]:
y_train.shape

(1952,)

In [42]:
# fit the training data with a Random Forest Classifier

classifier = RandomForestClassifier()
classifier.fit(X_train, y_train)

RandomForestClassifier()

In [43]:
y_pred = classifier.predict(X_test)
y_prob = classifier.predict_proba(X_test)
print("ROC AUC: ", roc_auc_score(y_test, y_prob, multi_class='ovr'))
print("Classification Report:",)
print (classification_report(y_test, y_pred))
print("Accuracy:", accuracy_score(y_test,y_pred))

ROC AUC:  0.8450481559034877
Classification Report:
              precision    recall  f1-score   support

           0       0.78      0.90      0.83       310
           1       0.58      0.49      0.53       110
           2       0.78      0.42      0.55        69

    accuracy                           0.74       489
   macro avg       0.71      0.60      0.64       489
weighted avg       0.73      0.74      0.73       489

Accuracy: 0.7402862985685071


In [44]:
y_pred = classifier.predict(X_test)
y_prob = classifier.predict_proba(X_test)
print("ROC AUC: ", roc_auc_score(y_test, y_prob, multi_class='ovo'))
print("Classification Report:",)
print (classification_report(y_test, y_pred))
print("Accuracy:", accuracy_score(y_test,y_pred))

ROC AUC:  0.831989034808109
Classification Report:
              precision    recall  f1-score   support

           0       0.78      0.90      0.83       310
           1       0.58      0.49      0.53       110
           2       0.78      0.42      0.55        69

    accuracy                           0.74       489
   macro avg       0.71      0.60      0.64       489
weighted avg       0.73      0.74      0.73       489

Accuracy: 0.7402862985685071
