In [1]:
!pip install tweepy

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
!pip install geocoder

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
! pip install pyspellchecker

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [4]:
# Importing necessary libraries
import pandas as pd
import warnings
import requests
import tweepy
import geocoder
warnings.filterwarnings('ignore')
import seaborn as sns
from sklearn.naive_bayes import MultinomialNB
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import  StandardScaler 
from sklearn import metrics as mt
from spellchecker import SpellChecker 
import nltk
nltk.download('punkt')
from nltk import word_tokenize
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk import FreqDist
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize, pos_tag
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('omw-1.4')
from sklearn.model_selection import GridSearchCV
import re
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier
#from sklearn.datasets import make_classification

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [5]:
# Loading the training dataset
data = pd.read_csv("labeled_data.csv", on_bad_lines='skip')
data.head()

Unnamed: 0.1,Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet
0,0,3,0,0,3,2,!!! RT @mayasolovely: As a woman you shouldn't...
1,1,3,0,3,0,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2,2,3,0,3,0,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
3,3,3,0,2,1,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
4,4,6,0,6,0,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...


In [6]:
data.shape

(24783, 7)

In [7]:
# Checking whether we have any null values or not in our dataset
data.isna().sum()

Unnamed: 0            0
count                 0
hate_speech           0
offensive_language    0
neither               0
class                 0
tweet                 0
dtype: int64

**Data Pre-processing**

In [8]:
# Dropping the non-essential features
data.drop(data.columns[[0,1,2,3,4]], axis=1, inplace=True)
data.head()

Unnamed: 0,class,tweet
0,2,!!! RT @mayasolovely: As a woman you shouldn't...
1,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
3,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
4,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...


In [9]:
# Removing rows with hate speech labelling
data = data[data['class'] != 0]

In [10]:
data.shape

(23353, 2)

In [11]:
# Re-labelling the data where 0 represents non-abusive and 1 represents abusive
data.loc[data['class']>1,'label'] = 0
data.loc[data['class']<=1,'label'] = 1
data.drop(data.columns[[0]], axis=1, inplace=True)
data.head()

Unnamed: 0,tweet,label
0,!!! RT @mayasolovely: As a woman you shouldn't...,0.0
1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...,1.0
2,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...,1.0
3,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...,1.0
4,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...,1.0


In [12]:
# Coverting the entire dataset to lower-case
data['tweet'] = data['tweet'].str.lower()
data.head()

Unnamed: 0,tweet,label
0,!!! rt @mayasolovely: as a woman you shouldn't...,0.0
1,!!!!! rt @mleew17: boy dats cold...tyga dwn ba...,1.0
2,!!!!!!! rt @urkindofbrand dawg!!!! rt @80sbaby...,1.0
3,!!!!!!!!! rt @c_g_anderson: @viva_based she lo...,1.0
4,!!!!!!!!!!!!! rt @shenikaroberts: the shit you...,1.0


In [13]:
# Replacing multiple spaces from the dataset to a single space
def removeSpace(tweet):
  return " ".join(tweet.split())

data['tweet'] = data['tweet'].apply(removeSpace)

In [14]:
data.head()

Unnamed: 0,tweet,label
0,!!! rt @mayasolovely: as a woman you shouldn't...,0.0
1,!!!!! rt @mleew17: boy dats cold...tyga dwn ba...,1.0
2,!!!!!!! rt @urkindofbrand dawg!!!! rt @80sbaby...,1.0
3,!!!!!!!!! rt @c_g_anderson: @viva_based she lo...,1.0
4,!!!!!!!!!!!!! rt @shenikaroberts: the shit you...,1.0


In [15]:
# Tokenize the entire tweet column of the dataset
data['tweet'] = data['tweet'].apply(lambda x: word_tokenize(x))

In [16]:
data.head()

Unnamed: 0,tweet,label
0,"[!, !, !, rt, @, mayasolovely, :, as, a, woman...",0.0
1,"[!, !, !, !, !, rt, @, mleew17, :, boy, dats, ...",1.0
2,"[!, !, !, !, !, !, !, rt, @, urkindofbrand, da...",1.0
3,"[!, !, !, !, !, !, !, !, !, rt, @, c_g_anderso...",1.0
4,"[!, !, !, !, !, !, !, !, !, !, !, !, !, rt, @,...",1.0


In [17]:
# #Rectifies the spelling mistakes

# def rectifySpelling(tweet):
#   sp = SpellChecker()
#   output = []
#   for line in tweet:
#     output.append(sp.correction(line))
#   return output


# data['tweet'] = data['tweet'].apply(rectifySpelling)
# data.head()

In [18]:
# Removing the most common words like prepositions , articles , etc.
stopwords = stopwords.words('english')
def removeWords(tweet):
  
  output = []
  for word in tweet:
    if word not in stopwords:
      output.append(word)
  return output

data['tweet'] = data['tweet'].apply(removeWords)

In [19]:
data.head()

Unnamed: 0,tweet,label
0,"[!, !, !, rt, @, mayasolovely, :, woman, n't, ...",0.0
1,"[!, !, !, !, !, rt, @, mleew17, :, boy, dats, ...",1.0
2,"[!, !, !, !, !, !, !, rt, @, urkindofbrand, da...",1.0
3,"[!, !, !, !, !, !, !, !, !, rt, @, c_g_anderso...",1.0
4,"[!, !, !, !, !, !, !, !, !, !, !, !, !, rt, @,...",1.0


In [20]:
# Removing the punctuation marks from the tweet column of the dataset
def removePunctuation(tweet):
  token = RegexpTokenizer(r"\w+")
  return token.tokenize(' '.join(tweet))

data['tweet'] = data['tweet'].apply(removePunctuation)

In [21]:
data.head()

Unnamed: 0,tweet,label
0,"[rt, mayasolovely, woman, n, t, complain, clea...",0.0
1,"[rt, mleew17, boy, dats, cold, tyga, dwn, bad,...",1.0
2,"[rt, urkindofbrand, dawg, rt, 80sbaby4life, ev...",1.0
3,"[rt, c_g_anderson, viva_based, look, like, tra...",1.0
4,"[rt, shenikaroberts, shit, hear, might, true, ...",1.0


In [22]:
# Finding the most frequent words in the dataset
def freqWords(tweet):
  lines = tweet.values
  output = []
  for line in lines:
    output+=line[0]
  return FreqDist(output).most_common(10)

wordFreq = freqWords(data)
print(wordFreq)

[('bitch', 8139), ('rt', 7319), ('t', 6525), ('n', 3872), ('128514', 3145), ('bitches', 3060), ('co', 2884), ('http', 2798), ('like', 2618), ('s', 2378)]


In [23]:
# Removing the most frequent words in the dataset except the ones which are abusive
def removeFreqWord(tweet):
  output = []
  for line in tweet:
    if line not in frequencies:
      output.append(line)
  return output
  

frequencies = []
for word, freq in wordFreq:
  if word=='bitch' or word=='bitches':
    pass
  else:
    frequencies.append(freq)
print(frequencies)

[7319, 6525, 3872, 3145, 2884, 2798, 2618, 2378]


In [24]:
data['tweet'] = data['tweet'].apply(removeFreqWord)

In [25]:
data.head()

Unnamed: 0,tweet,label
0,"[rt, mayasolovely, woman, n, t, complain, clea...",0.0
1,"[rt, mleew17, boy, dats, cold, tyga, dwn, bad,...",1.0
2,"[rt, urkindofbrand, dawg, rt, 80sbaby4life, ev...",1.0
3,"[rt, c_g_anderson, viva_based, look, like, tra...",1.0
4,"[rt, shenikaroberts, shit, hear, might, true, ...",1.0


In [26]:
# Converting the verbs into its root form using Lemmatizer
def lemmatize(tweet):
  output = []
  lst = ['a','r','n','v']
  wordNetLemmatizer = WordNetLemmatizer()
  posTag = pos_tag(tweet)
  
  for word, tag in posTag:
    pos = tag[0].lower()

    if pos not in lst:
      pos = 'n'
    output.append(wordNetLemmatizer.lemmatize(word,pos))
  return output

data['tweet'] = data['tweet'].apply(lemmatize)

In [27]:
data.head()

Unnamed: 0,tweet,label
0,"[rt, mayasolovely, woman, n, t, complain, clea...",0.0
1,"[rt, mleew17, boy, dat, cold, tyga, dwn, bad, ...",1.0
2,"[rt, urkindofbrand, dawg, rt, 80sbaby4life, ev...",1.0
3,"[rt, c_g_anderson, viva_based, look, like, tra...",1.0
4,"[rt, shenikaroberts, shit, hear, might, true, ...",1.0


In [28]:
# Removing HTML tags from the dataset
def removeTag(tweet):
  tweet=' '.join(tweet)
  return re.compile('<.*?>').sub(r'',tweet)

data['tweet'] = data['tweet'].apply(removeTag)

In [29]:
# def removeURL(s):
#   df['result'] = df['result'].str.replace(r'\D', '')
#   return re.compile(r'https?://\S+|www\.\S+').sub(r'',tweet)

#data['tweet'] = data['tweet'].apply(removeURL)

In [30]:
# Removing the substrings where http is present
data['tweet'] = data['tweet'].str.replace(r'http', '')

In [31]:
# Dividing the dataset into features and target
x= data['tweet']
y = data['label']

print(x.shape)
print(y.shape)

(23353,)
(23353,)


In [32]:
# Using Train_test_split to split the data into training and testing
X_train, X_test, y_train, y_test = train_test_split(x,y, test_size=0.3, random_state=42)
print(X_train.shape)
print(X_test.shape)

(16347,)
(7006,)


In [33]:
# Function to calculate various metrics
def metric_calculation(y_test,y_pred):
  print("Accuracy  :  ",mt.accuracy_score(y_test,y_pred))
  print("Precision :  ",mt.precision_score(y_test,y_pred))
  print("Recall    :  ",mt.recall_score(y_test,y_pred))
  print("F1-score  :  ",mt.f1_score(y_test,y_pred))
  print("Confusion matrix      : \n ",mt.confusion_matrix(y_test,y_pred))
  print("Classification report : \n",mt.classification_report(y_test,y_pred))

In [34]:
# Transforming a text into a vector on the basis of the frequency of each word
cv=CountVectorizer()
x_train=cv.fit_transform(X_train)
x_test = cv.transform(X_test)

print(x_train.shape)
print(x_test.shape)

(16347, 24867)
(7006, 24867)


In [35]:
# Typecasting to integer
y_train = y_train.astype('int')

In [36]:
# Using Naive Bayes to train a model and testing its accuracy
nb = MultinomialNB()
nb.fit(x_train,y_train)
y_pred = nb.predict(x_test)
metric_calculation(y_test,y_pred)

Accuracy  :   0.9132172423636883
Precision :   0.9148833467417539
Recall    :   0.9862966175195144
F1-score  :   0.9492487479131887
Confusion matrix      : 
  [[ 712  529]
 [  79 5686]]
Classification report : 
               precision    recall  f1-score   support

         0.0       0.90      0.57      0.70      1241
         1.0       0.91      0.99      0.95      5765

    accuracy                           0.91      7006
   macro avg       0.91      0.78      0.83      7006
weighted avg       0.91      0.91      0.91      7006



In [37]:
# Using SVM to train a model and testing its accuracy
X_train, X_test, y_train, y_test = train_test_split(x,y, test_size=0.25, random_state=21)
cv=CountVectorizer()
x_train=cv.fit_transform(X_train)
x_test = cv.transform(X_test)
svc=SVC()
svc.fit(x_train,y_train)
y_pred_svm=svc.predict(x_test)
metric_calculation(y_test,y_pred_svm)

Accuracy  :   0.95992464463093
Precision :   0.9861907796898237
Recall    :   0.9648721679484514
F1-score  :   0.9754150031519226
Confusion matrix      : 
  [[ 963   65]
 [ 169 4642]]
Classification report : 
               precision    recall  f1-score   support

         0.0       0.85      0.94      0.89      1028
         1.0       0.99      0.96      0.98      4811

    accuracy                           0.96      5839
   macro avg       0.92      0.95      0.93      5839
weighted avg       0.96      0.96      0.96      5839



In [38]:
# Using Grid Search to find the best parameters for DecisionTree Classifier
X_train, X_test, y_train, y_test = train_test_split(x,y, test_size=0.25, random_state=31)
cv=CountVectorizer()
x_train=cv.fit_transform(X_train)
x_test = cv.transform(X_test)
max_features = ['auto', 'sqrt', 'log2']
max_depth = [5,6,7,8]
criterion = ['gini', 'entropy']
dt = DecisionTreeClassifier()
gs_dt = GridSearchCV(dt, param_grid = { 'max_features':max_features,'max_depth':max_depth,'criterion':criterion})
gs_dt.fit(x_train,y_train)

gs_dt.best_params_

{'criterion': 'gini', 'max_depth': 6, 'max_features': 'auto'}

In [39]:
# Using the above estimators to train a Decision Tree model and testing its accuracy
dt_best= DecisionTreeClassifier(max_depth=8, max_features='auto')
dt_best.fit(x_train,y_train)
y_pred_dt=dt_best.predict(x_test)

metric_calculation(y_test,y_pred_dt)

Accuracy  :   0.8277102243534852
Precision :   0.8276748971193416
Recall    :   0.9997928748964374
F1-score  :   0.9056285178236397
Confusion matrix      : 
  [[   6 1005]
 [   1 4827]]
Classification report : 
               precision    recall  f1-score   support

         0.0       0.86      0.01      0.01      1011
         1.0       0.83      1.00      0.91      4828

    accuracy                           0.83      5839
   macro avg       0.84      0.50      0.46      5839
weighted avg       0.83      0.83      0.75      5839



In [40]:
# Using Grid Search to find the best parameters for RandomForest Classifier
X_train, X_test, y_train, y_test = train_test_split(x,y, test_size=0.2, random_state=42)
cv=CountVectorizer()
x_train=cv.fit_transform(X_train)
x_test = cv.transform(X_test)
# rf= RandomForestClassifier()
# n_estimators = [200, 500]
# max_features = ['auto', 'sqrt', 'log2']
# max_depth = [5,7]
# criterion = ['gini', 'entropy']
# gs = GridSearchCV(rf, param_grid = {'n_estimators':n_estimators, 'max_features':max_features,'max_depth':max_depth,'criterion':criterion})
# gs.fit(x_train,y_train)

# gs.best_params_

In [41]:
# Using the above estimators to train a RandomForest model and testing its accuracy

rf_best= RandomForestClassifier(max_depth=5, n_estimators=200)
rf_best.fit(x_train,y_train)
y_pred_rf=rf_best.predict(x_test)

metric_calculation(y_test,y_pred_rf)

Accuracy  :   0.8238064654249625
Precision :   0.8238064654249625
Recall    :   1.0
F1-score  :   0.9033924169503463
Confusion matrix      : 
  [[   0  823]
 [   0 3848]]
Classification report : 
               precision    recall  f1-score   support

         0.0       0.00      0.00      0.00       823
         1.0       0.82      1.00      0.90      3848

    accuracy                           0.82      4671
   macro avg       0.41      0.50      0.45      4671
weighted avg       0.68      0.82      0.74      4671



In [42]:
# Using Grid Search to find the best parameters for Logistic Regression
X_train, X_test, y_train, y_test = train_test_split(x,y, test_size=0.2, random_state=42)
cv=CountVectorizer()
x_train=cv.fit_transform(X_train)
x_test = cv.transform(X_test)
grid={ "penalty":["l1","l2"]}# l1 lasso l2 ridge
logreg=LogisticRegression()
logreg_cv=GridSearchCV(logreg,grid,cv=10)
logreg_cv.fit(x_train,y_train)

GridSearchCV(cv=10, estimator=LogisticRegression(),
             param_grid={'penalty': ['l1', 'l2']})

In [43]:
logreg_cv.best_params_

{'penalty': 'l2'}

In [44]:
# Using the above estimators to train a Logistic Regression model and testing its accuracy
logreg_best= LogisticRegression(penalty= 'l2')
logreg_best.fit(x_train,y_train)
y_pred_logreg=logreg_best.predict(x_test)

metric_calculation(y_test,y_pred_logreg)

Accuracy  :   0.9633911368015414
Precision :   0.9865043662344536
Recall    :   0.9688149688149689
F1-score  :   0.9775796512390194
Confusion matrix      : 
  [[ 772   51]
 [ 120 3728]]
Classification report : 
               precision    recall  f1-score   support

         0.0       0.87      0.94      0.90       823
         1.0       0.99      0.97      0.98      3848

    accuracy                           0.96      4671
   macro avg       0.93      0.95      0.94      4671
weighted avg       0.97      0.96      0.96      4671



In [45]:
# Using Voting Classifier with Logistic Regression , SVM and Random Forest to get a better model
X_train, X_test, y_train, y_test = train_test_split(x,y, test_size=0.3, random_state=True)
cv=CountVectorizer()
x_train=cv.fit_transform(X_train)
x_test = cv.transform(X_test)
vc=VotingClassifier(estimators=[('logreg_best',logreg_best),('svc',svc),('nb',nb)],voting='hard')
vc.fit(x_train,y_train)
y_pred_vc=vc.predict(x_test)

metric_calculation(y_test,y_pred_vc)

Accuracy  :   0.9620325435341136
Precision :   0.9799613288802953
Recall    :   0.9734590536057273
F1-score  :   0.9766993693062369
Confusion matrix      : 
  [[1165  114]
 [ 152 5575]]
Classification report : 
               precision    recall  f1-score   support

         0.0       0.88      0.91      0.90      1279
         1.0       0.98      0.97      0.98      5727

    accuracy                           0.96      7006
   macro avg       0.93      0.94      0.94      7006
weighted avg       0.96      0.96      0.96      7006



**Testing with real-time data using Twitter API**

In [46]:
# Authenticating Twitter API
consumer_key = 'HZRzp3wipiTD7mPuHb6TbayFO'
consumer_secret = 'vfsSVritLxSC7traklojC7knAi4qGDPzrFsbCsAzCqfEyeNXRH'
access_token = '1593233208648224768-CUfwIsUxxZhZD1mKzRvPIn7fSmar99'
access_token_secret = 'stK9CfoU21OEsF3T2hwO7DnelvxCpNWxhe1tq2CAB7ScJ'

auth=tweepy.OAuthHandler(consumer_key,consumer_secret)
auth.set_access_token(access_token,access_token_secret)
api=tweepy.API(auth)

In [47]:
# Setting the location as London
closest_loc = api.trends_closest(51.5072, 0.1276)
closest_loc[0]

{'name': 'London',
 'placeType': {'code': 7, 'name': 'Town'},
 'url': 'http://where.yahooapis.com/v1/place/44418',
 'parentid': 23424975,
 'country': 'United Kingdom',
 'woeid': 44418,
 'countryCode': 'GB'}

In [66]:
# Getting the trending hashtags from London location
l=[]
trends = api.trends_place(closest_loc[0]["woeid"])

for i in range(47): 
  s=trends[0]["trends"][i]["name"]
  if s[0]=='#':
    print(s)
    l.append(s)

#r4today
#NursesStrike
#santaliveread
#NHSWorkersNeedAPayrise
#HarryandMeganNetflix
#thursdayvibes
#12DaysOfTechmas
#EnoughIsEnough
#SafeStaffingSavesLives
#SupportOurNHSWorkers


In [79]:
# getting the recent tweets on the top 3 trending hashtags
k=[]
main=[]
for j in range(3):
    for tweet in tweepy.Cursor(api.search, q=l[j]).items(15):
        print(tweet.text)
        k.append(tweet.text)
        
        print("___________________")
  

RT @feistywomankent: A reminder from June 2020 - claps don’t pay the supermarket bills. #SupportTheNurses #FairPayforNursing #NursesStrike…
___________________
RT @Hepworthclare: Disgraceful behaviour from Nick Robinson #r4today interviewing the RCN General Secretary.
No forensic probing to establi…
___________________
RT @rjcruthers: Woken by Nick Robinson (annual salary £270,000 - £274,999) getting hot under the collar about nurses asking for more during…
___________________
̗#Motivation 👍

#acoty2022 #netizensreport #iranrevolution #trading #exchange #kids #r4today #80s https://t.co/HbQ7wYUeh1
___________________
RT @ApeBaffled: Tory PR man Nick Robinson on £275k quoting Sunak's 'Independent, Independent, Independent' on #r4today re #NursesStrike who…
___________________
RT @AlecHitchman1: Economics 

Income £20.0
Outgoings £19.6d
Outcome happiness. 

Income £20.0
Outgoings £20.6d
Outcome misery. 

Dickens k…
___________________
RT @The_ChrisShaw: Strong interview on @BBCr4today wit

In [50]:
#dir(api)

In [80]:
# Convering the twitter data to a Dataframe
df = pd.DataFrame(k)
df.columns=['Tweet']
df.head()

Unnamed: 0,Tweet
0,RT @feistywomankent: A reminder from June 2020...
1,RT @Hepworthclare: Disgraceful behaviour from ...
2,RT @rjcruthers: Woken by Nick Robinson (annual...
3,̗#Motivation 👍\n\n#acoty2022 #netizensreport #...
4,RT @ApeBaffled: Tory PR man Nick Robinson on £...


In [81]:
# Removing the unnecessary characters from the data received from twitter
df['Tweet'] = df['Tweet'].str.replace('RT', '1111', regex=True)
df['Tweet'] = df['Tweet'].str.replace(' ', '1111', regex=True)
df['Tweet'] = df['Tweet'].str.replace('\W', '', regex=True)
df['Tweet'] = df['Tweet'].str.replace('1111', ' ', regex=True)
res = [idx for idx in df['Tweet'] if not re.findall("[^\u0000-\u05C0\u2100-\u214F]+", idx)]

res=pd.DataFrame(res)
res.columns=['Tweet']

In [82]:
# Making a copy of the dataset for future use
copy=res

In [83]:
# Applying the same  pre-processing on the twitter data as on the training dataset
res['Tweet'] = res['Tweet'].str.lower()
res['Tweet'] = res['Tweet'].apply(removeSpace)
res['Tweet'] = res['Tweet'].apply(lambda x: word_tokenize(x))
# res['tweet'] = res['tweet'].apply(rectifySpelling)
res['Tweet'] = res['Tweet'].apply(removeWords)
res['Tweet'] = res['Tweet'].apply(removePunctuation)
#wordFreq = freqWords(res)
#res['Tweet'] = res['Tweet'].apply(removeFreqWord)
res['Tweet'] = res['Tweet'].apply(lemmatize)
res['Tweet'] = res['Tweet'].apply(removeTag)
res['Tweet'] = res['Tweet'].str.replace(r'http', '')

In [84]:
# Transforming a text into a vector on the basis of the frequency of each word
vect=res["Tweet"]
x_cv=cv.fit_transform(x)
test_df = cv.transform(vect)

In [85]:
res["Tweet"]

0     feistywomankent reminder june 2020 clap dont p...
1     hepworthclare disgraceful behaviour nick robin...
2     rjcruthers wake nick robinson annual salary 27...
3     motivation acoty2022 netizensreport iranrevolu...
4     apebaffled tory pr man nick robinson 275k quot...
5     alechitchman 1economics income 200outgoings 19...
6     the_chrisshaw strong interview bbcr4today rise...
7     feistywomankent reminder june 2020 clap dont p...
8     rjcruthers wake nick robinson annual salary 27...
9     zoejardiniere story r4today misinform public r...
10    candistore mick lynch speak amp woman like fee...
11    feistywomankent reminder june 2020 clap dont p...
12    tonydowling stand solidarity thercn nurse stri...
13    tonypitchford perhaps nick robinson could save...
14    stand_for_all year government push false infor...
15    thesarahdriver nurse strike put patient risk w...
16    themingford 12 year tory failure greed theft c...
17    today cant join nursesstrike trust didnt m

In [86]:
# Checking the shape of the training and testing dataset
print(x_cv.shape)
print(y.shape)
print(test_df.shape)

(23353, 31651)
(23353,)
(44, 31651)


In [74]:
# Training the entire dataset using Voting Classifier
vc.fit(x_cv,y)

VotingClassifier(estimators=[('logreg_best', LogisticRegression()),
                             ('svc', SVC()), ('nb', MultinomialNB())])

In [87]:
# Predicting on the twitter dataset
y_pred_tvc = vc.predict(test_df)
df_y_pred=pd.DataFrame(y_pred_tvc)

In [88]:
# Concatenating the predicted values with the original twitter data for better visualization
result = pd.concat([copy['Tweet'], df_y_pred], axis=1)
result.columns=['Tweet','Label']
result

Unnamed: 0,Tweet,Label
0,feistywomankent reminder june 2020 clap dont p...,0.0
1,hepworthclare disgraceful behaviour nick robin...,0.0
2,rjcruthers wake nick robinson annual salary 27...,0.0
3,motivation acoty2022 netizensreport iranrevolu...,0.0
4,apebaffled tory pr man nick robinson 275k quot...,0.0
5,alechitchman 1economics income 200outgoings 19...,0.0
6,the_chrisshaw strong interview bbcr4today rise...,0.0
7,feistywomankent reminder june 2020 clap dont p...,0.0
8,rjcruthers wake nick robinson annual salary 27...,0.0
9,zoejardiniere story r4today misinform public r...,0.0


In [95]:
import numpy as np
a=["This is a sunny morning",
"Shut the fuck up",
"Bonjour",
"You have a very ugly face.",
"Dont be a jerk",
"You asshole",
"Happy morning",
"Bloody bastard",
"Astala vista"]
data = np.array(a)
 

# a=pd.DataFrame(a)
test_df = cv.transform(a)
y_pred_t = vc.predict(test_df)

In [96]:
a1=pd.DataFrame(a)
y_pred_t=pd.DataFrame(y_pred_t)
result1 = pd.concat([a1, y_pred_t], axis=1)
result1

Unnamed: 0,0,0.1
0,This is a sunny morning,0.0
1,Shut the fuck up,1.0
2,Bonjour,0.0
3,You have a very ugly face.,1.0
4,Dont be a jerk,0.0
5,You asshole,1.0
6,Happy morning,0.0
7,Bloody bastard,1.0
8,Astala vista,0.0


In [93]:
result['Tweet'][32]

'ffordddyffryn dosbarth siabod get ready live feed santa north pole christmasclassic santaliveread stc'

In [94]:
result['Tweet'][26]

'themingford never forget loved one needlessly lose ghoul picture nurse love riski'

In [64]:
# for tweeti in tweepy.Cursor(api.search, q='#RIPTwitch').items(20):
#         print(tweeti.text)
#         print("_____")

In [65]:
# cursor = tweepy.Cursor(api.user_timeline,id='@sbravo1999',tweet_mode="extended").items()
# for i in cursor:
#   print(i.full_text)