In [1]:
import pandas as pd
import numpy as np

import matplotlib
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
import seaborn as sns
sns.set_style("whitegrid")

#for text pre-processing
import re, string
import nltk
from textblob import TextBlob #spelling correction
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
#nltk.download('punkt')
#nltk.download('averaged_perceptron_tagger')
#nltk.download('wordnet')

#for model-building
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, f1_score, accuracy_score, confusion_matrix
from sklearn.metrics import roc_curve, auc, roc_auc_score

# bag of words
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

#for word embedding
import gensim
from gensim.models import Word2Vec

import time

%matplotlib inline

pd.set_option('display.max_columns', None)

import warnings

warnings.filterwarnings('ignore')
warnings.filterwarnings("ignore", message="numpy.ufunc size changed")

In [11]:
# function for tweet pre-processing

# function for text cleaning
def clean(text):
    text = text.replace('RT', ' ') #get rid of RTs
    text = re.sub('(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)','',text)
    text = text.lower().strip() #removes upper case and leading and trailing whitespaces
    text = re.compile('<.*?>').sub('', text) #removing inside of tags <>
    text = re.compile('[%s]' % re.escape(string.punctuation)).sub(' ', text) #strip punctuation from string
    text = re.compile('https?://S+|www.S+').sub('', text) #removing URLs
    text = re.sub('\s+', ' ', text) #replacing double space with single space
    text = re.sub(r'\[[0-9]*\]',' ', text)  #replacing any number by a single space
    text = re.sub(r'\d',' ', text) #get rid of decimal digits
    text = re.sub(r'\s+',' ', text) #get rid of duplicate whitespaces

    return text

# function for spelling correction
def correct_spelling(text):
    text = str(TextBlob(text))
    return text

# function for removing emojis
def termninemoji(text):
    no_emo = re.compile(pattern = "["
        u"\U0001F600-\U0001F64F" 
        u"\U0001F300-\U0001F5FF"
        u"\U0001F680-\U0001F6FF"
        u"\U0001F1E0-\U0001F1FF"
        u"\U00002500-\U00002BEF"
        u"\U00002702-\U000027B0"
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
        u"\U0001f926-\U0001f937"
        u"\U00010000-\U0010ffff"
        u"\u2640-\u2642" 
        u"\u2600-\u2B55"
        u"\u200d"
        u"\u23cf"
        u"\u23e9"
        u"\u231a"
        u"\ufe0f"
        u"\u3030"
                           "]+", flags = re.UNICODE).sub(r'', text)
    return no_emo

# function for Stop Word removal
def stop(text):
    stop = stopwords.words('english')
    no_stop_words = [word for word in text.split() if word not in stop]
    return ' '.join(no_stop_words)

# function text pre-processing

def preprocess(text):
    text = clean(text)
    text = correct_spelling(text)
    text = termninemoji(text)
    text = stop(text)
    return text

# function for lemmatization

# getting the tags (word type: verb, noun, adverb etc.)
def get_type(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    elif tag.startswith('DT'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

def lemmatization(text):
    tokens = word_tokenize(text) #tokenizing
    word_and_tag = nltk.pos_tag(tokens)
    lemmatizer = WordNetLemmatizer()
    lemmatized = [lemmatizer.lemmatize(pair[0], pos=get_type(pair[1])) for pair in word_and_tag]
    return ' '.join(lemmatized)

# function for final pre-processing

def final_prep(text):
    text = clean(text)
    text = correct_spelling(text)
    text = termninemoji(text)
    text = stop(text)
    text = lemmatization(text)
    return text

In [12]:
data = pd.read_csv('final_df.csv')
data.head()

Unnamed: 0,entities,lang,text,conversation_id,created_at,possibly_sensitive,author_id,reply_settings,context_annotations,referenced_tweets,source,id,in_reply_to_user_id,attachments,geo,withheld,retweet_count,reply_count,like_count,quote_count
0,"{'mentions': [{'start': 3, 'end': 14, 'usernam...",en,RT @MlkzyHywad: @GOPChairwoman The only cause ...,1563562988434780160,2022-08-27T16:24:00.000Z,0.0,1483439745900462091,everyone,"[{'domain': {'id': '10', 'name': 'Person', 'de...","[{'type': 'retweeted', 'id': '1563551195914186...",Twitter for Android,1563562988434780160,,,,,4,0,0,0
1,,en,The true duality of man is living day-to-day i...,1563562986916028419,2022-08-27T16:24:00.000Z,0.0,550210137,everyone,,,Twitter for iPhone,1563562986916028419,,,,,0,0,0,0
2,"{'mentions': [{'start': 3, 'end': 19, 'usernam...",en,RT @musclesnnursing: People keep asking when h...,1563562986765033473,2022-08-27T16:24:00.000Z,0.0,22447783,everyone,,"[{'type': 'retweeted', 'id': '1563329867612954...",Twitter for iPhone,1563562986765033473,,,,,3,0,0,0
3,"{'mentions': [{'start': 3, 'end': 16, 'usernam...",en,RT @Logically_JC: If you are afraid of sociali...,1563562986513453061,2022-08-27T16:24:00.000Z,0.0,1504505694837522432,everyone,,"[{'type': 'retweeted', 'id': '1563522431578361...",Twitter Web App,1563562986513453061,,,,,444,0,0,0
4,,en,"what're you doin' tonight, hey, boy?\nset my a...",1563562986182127618,2022-08-27T16:24:00.000Z,0.0,1531878468882923520,everyone,,,"Cheap Bots, Done Quick!",1563562986182127618,,,,,0,0,0,0


In [13]:
data['prep_text'] = data['text'].apply(lambda x: final_prep(x))
data.head()

Unnamed: 0,entities,lang,text,conversation_id,created_at,possibly_sensitive,author_id,reply_settings,context_annotations,referenced_tweets,source,id,in_reply_to_user_id,attachments,geo,withheld,retweet_count,reply_count,like_count,quote_count,prep_text
0,"{'mentions': [{'start': 3, 'end': 14, 'usernam...",en,RT @MlkzyHywad: @GOPChairwoman The only cause ...,1563562988434780160,2022-08-27T16:24:00.000Z,0.0,1483439745900462091,everyone,"[{'domain': {'id': '10', 'name': 'Person', 'de...","[{'type': 'retweeted', 'id': '1563551195914186...",Twitter for Android,1563562988434780160,,,,,4,0,0,0,cause insecurity country america servantspanic...
1,,en,The true duality of man is living day-to-day i...,1563562986916028419,2022-08-27T16:24:00.000Z,0.0,550210137,everyone,,,Twitter for iPhone,1563562986916028419,,,,,0,0,0,0,true duality man live daytoday terror potentia...
2,"{'mentions': [{'start': 3, 'end': 19, 'usernam...",en,RT @musclesnnursing: People keep asking when h...,1563562986765033473,2022-08-27T16:24:00.000Z,0.0,22447783,everyone,,"[{'type': 'retweeted', 'id': '1563329867612954...",Twitter for iPhone,1563562986765033473,,,,,3,0,0,0,people keep ask he arrest forget panic phase c...
3,"{'mentions': [{'start': 3, 'end': 16, 'usernam...",en,RT @Logically_JC: If you are afraid of sociali...,1563562986513453061,2022-08-27T16:24:00.000Z,0.0,1504505694837522432,everyone,,"[{'type': 'retweeted', 'id': '1563522431578361...",Twitter Web App,1563562986513453061,,,,,444,0,0,0,jc afraid socialism love subsidy dont understa...
4,,en,"what're you doin' tonight, hey, boy?\nset my a...",1563562986182127618,2022-08-27T16:24:00.000Z,0.0,1531878468882923520,everyone,,,"Cheap Bots, Done Quick!",1563562986182127618,,,,,0,0,0,0,whatre doin tonight hey boyset alarm turn char...


In [14]:
data['prep_text']

0       cause insecurity country america servantspanic...
1       true duality man live daytoday terror potentia...
2       people keep ask he arrest forget panic phase c...
3       jc afraid socialism love subsidy dont understa...
4       whatre doin tonight hey boyset alarm turn char...
                              ...                        
9980    ssyb n mine im afraid drive couldnt afford sch...
9981    guttenberg hey would love visit office maybe v...
9982    people kaduna rally cancel elrufai liver ball ...
9983    desta abiy regime whose defining feature barba...
9984    k happiness set alarm next daysabka dil mein k...
Name: prep_text, Length: 9985, dtype: object