# Importing necessary Libraries

In [1]:
import numpy as np
import pandas as pd

from gensim.models import Word2Vec

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB

from sklearn.metrics import accuracy_score
import os


from tqdm import tqdm
from nltk.sentiment.vader import SentimentIntensityAnalyzer  

# Reading Twitter Customer support Data

In [2]:
d=pd.read_csv(r"C:\Users\Admin\OneDrive\Documents\Data Science\G-AI\Twitter_data_processing\twcs.csv\twcs.csv")

In [3]:
d.head()

Unnamed: 0,tweet_id,author_id,inbound,created_at,text,response_tweet_id,in_response_to_tweet_id
0,1,sprintcare,False,Tue Oct 31 22:10:47 +0000 2017,@115712 I understand. I would like to assist y...,2.0,3.0
1,2,115712,True,Tue Oct 31 22:11:45 +0000 2017,@sprintcare and how do you propose we do that,,1.0
2,3,115712,True,Tue Oct 31 22:08:27 +0000 2017,@sprintcare I have sent several private messag...,1.0,4.0
3,4,sprintcare,False,Tue Oct 31 21:54:49 +0000 2017,@115712 Please send us a Private Message so th...,3.0,5.0
4,5,115712,True,Tue Oct 31 21:49:35 +0000 2017,@sprintcare I did.,4.0,6.0


In [4]:
# Lower casing te text column

d['clean_text']=d['text'].str.lower()

In [5]:
import re

In [6]:
def html_removal_tags(text):
  pattern=re.compile('<.*?>')
  return pattern.sub("",text)

In [7]:
d["clean_text"]=d["clean_text"].apply(html_removal_tags)

In [8]:
def remove_url(text):
    pattern=re.compile(r'https?://\S+|www\.\S+')
    return pattern.sub("",text)

In [9]:
d['clean_text']=d['clean_text'].apply(remove_url)

In [10]:
import string

In [11]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [12]:
exclude=set(string.punctuation)
exclude

{'!',
 '"',
 '#',
 '$',
 '%',
 '&',
 "'",
 '(',
 ')',
 '*',
 '+',
 ',',
 '-',
 '.',
 '/',
 ':',
 ';',
 '<',
 '=',
 '>',
 '?',
 '@',
 '[',
 '\\',
 ']',
 '^',
 '_',
 '`',
 '{',
 '|',
 '}',
 '~'}

In [13]:
def remove_punc(text):
    # Remove punctuation characters
    text_no_punc = ''.join(char for char in text if char not in exclude)
    return text_no_punc

In [14]:
d['clean_text']=d['clean_text'].apply(remove_punc)

In [15]:
# spell correction

import textblob

In [16]:
chat_words={
   " AFAIK":"As Far As I Know",
"AFK": "Away From Keyboard",
"ASAP":"As Soon As Possible",
"BTW":"By The Way",
"B4":"Before",
"LAMO":"Laugh My A.. Off",
"FYI":"For your information"
}

def chat_conversion(text):
    new_text=[]
    for w in text.split():
        if w.upper() in chat_words:
            new_text.append(chat_words[w.upper()])
        else:
            new_text.append(w)
    return " ".join(new_text)

In [17]:
d['clean_text']=d['clean_text'].apply(chat_conversion)

In [18]:
from textblob import TextBlob

def spell_correction(text):
  obj=TextBlob(text)
  return obj.correct().string

In [19]:
# d['clean_text']=d['clean_text'].apply(spell_correction)

In [20]:
import nltk
from nltk.corpus import stopwords

# def remove_stopwords(text):
#     new_text=[]

#     for word in text.split():
#         if word in stopwords.words("english"):
#             new_text.append("")
#         else:
#             new_text.append(word.strip())


#     return " ".join(new_text).replace("   ","")

In [21]:
# Load stopwords once
stop_words = set(stopwords.words("english"))

def remove_stopwords_(text):
    # Split the text into words
    words = text.split()
    
    # Remove stopwords
    filtered_words = [word for word in words if word.lower() not in stop_words]
    
    # Join the filtered words back into a string
    filtered_text = " ".join(filtered_words)
    
    return filtered_text

In [22]:
d['clean_text']=d['clean_text'].apply(remove_stopwords_)

In [23]:
# pip install emoji

In [24]:
import emoji

def remove_emoji(text):
  clear_txt=emoji.demojize(text)
  return clear_txt

In [25]:
# d['clean_text']=d['clean_text'].apply(remove_emoji)

In [26]:
d.shape

(2811774, 8)

In [27]:
d.head()

Unnamed: 0,tweet_id,author_id,inbound,created_at,text,response_tweet_id,in_response_to_tweet_id,clean_text
0,1,sprintcare,False,Tue Oct 31 22:10:47 +0000 2017,@115712 I understand. I would like to assist y...,2.0,3.0,115712 understand would like assist would need...
1,2,115712,True,Tue Oct 31 22:11:45 +0000 2017,@sprintcare and how do you propose we do that,,1.0,sprintcare propose
2,3,115712,True,Tue Oct 31 22:08:27 +0000 2017,@sprintcare I have sent several private messag...,1.0,4.0,sprintcare sent several private messages one r...
3,4,sprintcare,False,Tue Oct 31 21:54:49 +0000 2017,@115712 Please send us a Private Message so th...,3.0,5.0,115712 please send us private message assist c...
4,5,115712,True,Tue Oct 31 21:49:35 +0000 2017,@sprintcare I did.,4.0,6.0,sprintcare


In [28]:
# deltetig the numbers from the text

def remove_numbers(text):
  removed_words=re.sub(r'\d+','',text)
  return removed_words

In [29]:
# d['clean_text']=d['clean_text'].apply(remove_numbers)

In [30]:
d.head()

Unnamed: 0,tweet_id,author_id,inbound,created_at,text,response_tweet_id,in_response_to_tweet_id,clean_text
0,1,sprintcare,False,Tue Oct 31 22:10:47 +0000 2017,@115712 I understand. I would like to assist y...,2.0,3.0,115712 understand would like assist would need...
1,2,115712,True,Tue Oct 31 22:11:45 +0000 2017,@sprintcare and how do you propose we do that,,1.0,sprintcare propose
2,3,115712,True,Tue Oct 31 22:08:27 +0000 2017,@sprintcare I have sent several private messag...,1.0,4.0,sprintcare sent several private messages one r...
3,4,sprintcare,False,Tue Oct 31 21:54:49 +0000 2017,@115712 Please send us a Private Message so th...,3.0,5.0,115712 please send us private message assist c...
4,5,115712,True,Tue Oct 31 21:49:35 +0000 2017,@sprintcare I did.,4.0,6.0,sprintcare


In [31]:
# Lemmatization

from nltk.stem import WordNetLemmatizer

def lemmatization(text):
    words=text.split()

    lemmetizer=WordNetLemmatizer()

    lemetized_word=[lemmetizer.lemmatize(word) for word in words]
    
    return " ".join(lemetized_word)

In [32]:
d['clean_text']=d['clean_text'].apply(lemmatization)

In [33]:
d.head()

Unnamed: 0,tweet_id,author_id,inbound,created_at,text,response_tweet_id,in_response_to_tweet_id,clean_text
0,1,sprintcare,False,Tue Oct 31 22:10:47 +0000 2017,@115712 I understand. I would like to assist y...,2.0,3.0,115712 understand would like assist would need...
1,2,115712,True,Tue Oct 31 22:11:45 +0000 2017,@sprintcare and how do you propose we do that,,1.0,sprintcare propose
2,3,115712,True,Tue Oct 31 22:08:27 +0000 2017,@sprintcare I have sent several private messag...,1.0,4.0,sprintcare sent several private message one re...
3,4,sprintcare,False,Tue Oct 31 21:54:49 +0000 2017,@115712 Please send us a Private Message so th...,3.0,5.0,115712 please send u private message assist cl...
4,5,115712,True,Tue Oct 31 21:49:35 +0000 2017,@sprintcare I did.,4.0,6.0,sprintcare


In [34]:
d1=d.copy() # demonstrate word tokenization
d2=d.copy() # demonstrate sentence tokenization
d3=d.copy() # demonstration of word2vec
d4=d.copy()

# dataframe d demonstrates count vectorizer without tokenization

### Let us use d4 dataframe for word2vec then sentiment analysis

In [35]:
# tokenization can be done using split() OR using regular expressions or using NLTK
import nltk
from nltk.tokenize import word_tokenize,sent_tokenize

In [36]:
# def wd_tk(text):
#     t_text=word_tokenize(text)
#     return t_text


# d1['text']=d1['text'].apply(wd_tk)


In [37]:
# def sen_tk(text):
#     t_text=word_tokenize(text)
#     return t_text


# d2['text']=d2['text'].apply(sen_tk)


In [38]:
# encoding using count vectorizer

from sklearn.feature_extraction.text import CountVectorizer

In [39]:
BOW=CountVectorizer()

In [40]:
# document_matrix=BOW.fit_transform(d['text'])

In [41]:
# document_matrix[0].toarray()

In [42]:
# BOW.vocabulary_

In [43]:
# document_matrix[1].toarray()

In [44]:
bigram=CountVectorizer(ngram_range=(2,2))

In [45]:
# bigram_doc=bigram.fit_transform(d['text'])

In [46]:
# bigram.vocabulary_

In [47]:
# trigram=CountVectorizer(ngram_range=(3,3))

In [48]:
# trigramdata=trigram.fit_transform(d["text"])

In [49]:
# trigram.vocabulary_

In [50]:
# mix=CountVectorizer(ngram_range=(1,2))
# mix_vocab=mix.fit_transform(d["text"])

In [51]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [52]:
tfidf=TfidfVectorizer()

In [53]:
# t=tfidf.fit_transform(d["text"])

In [54]:
# t[0].toarray()

In [55]:
# tfidf.get_feature_names_out()  # vocabulary

In [56]:
# len(tfidf.get_feature_names_out())

In [57]:
d4.head()

Unnamed: 0,tweet_id,author_id,inbound,created_at,text,response_tweet_id,in_response_to_tweet_id,clean_text
0,1,sprintcare,False,Tue Oct 31 22:10:47 +0000 2017,@115712 I understand. I would like to assist y...,2.0,3.0,115712 understand would like assist would need...
1,2,115712,True,Tue Oct 31 22:11:45 +0000 2017,@sprintcare and how do you propose we do that,,1.0,sprintcare propose
2,3,115712,True,Tue Oct 31 22:08:27 +0000 2017,@sprintcare I have sent several private messag...,1.0,4.0,sprintcare sent several private message one re...
3,4,sprintcare,False,Tue Oct 31 21:54:49 +0000 2017,@115712 Please send us a Private Message so th...,3.0,5.0,115712 please send u private message assist cl...
4,5,115712,True,Tue Oct 31 21:49:35 +0000 2017,@sprintcare I did.,4.0,6.0,sprintcare


In [58]:
def wd_tk(text):
    #t_text=word_tokenize(text)
    word=text.split()
    return word


d4['clean_text']=d4['clean_text'].apply(wd_tk)

In [59]:
d5=d4[['text','clean_text','inbound']].copy()

In [60]:
d5.head()

Unnamed: 0,text,clean_text,inbound
0,@115712 I understand. I would like to assist y...,"[115712, understand, would, like, assist, woul...",False
1,@sprintcare and how do you propose we do that,"[sprintcare, propose]",True
2,@sprintcare I have sent several private messag...,"[sprintcare, sent, several, private, message, ...",True
3,@115712 Please send us a Private Message so th...,"[115712, please, send, u, private, message, as...",False
4,@sprintcare I did.,[sprintcare],True


# Sentiment Intensity Analyser

In [61]:
# from nltk.sentiment.vader import SentimentIntensityAnalyzer

# # Download VADER lexicon if not already downloaded
# nltk.download('vader_lexicon')

In [62]:
# def unlist(lst):
#     words = ''
#     for item in lst:
#         words += item + ' '
#     return words

In [63]:

# def compute_vader_scores(df, label):
#     sid = SentimentIntensityAnalyzer()
#     df["vader_neg"] = df[label].apply(lambda x: sid.polarity_scores(unlist(x))["neg"])
#     df["vader_neu"] = df[label].apply(lambda x: sid.polarity_scores(unlist(x))["neu"])
#     df["vader_pos"] = df[label].apply(lambda x: sid.polarity_scores(unlist(x))["pos"])
#     df["vader_comp"] = df[label].apply(lambda x: sid.polarity_scores(unlist(x))["compound"])
#     df['cleantext2'] = df[label].apply(lambda x: unlist(x))
#     return df

In [64]:
# df2 = compute_vader_scores(d5,'clean_text')

In [65]:
# Objective: Let us make a prediction 
# Whether or not the tweet was sent (inbound) to a company

In [66]:
label_encoder = LabelEncoder()
d5['label'] = label_encoder.fit_transform(d5['inbound'])

In [67]:
d5.head()

Unnamed: 0,text,clean_text,inbound,label
0,@115712 I understand. I would like to assist y...,"[115712, understand, would, like, assist, woul...",False,0
1,@sprintcare and how do you propose we do that,"[sprintcare, propose]",True,1
2,@sprintcare I have sent several private messag...,"[sprintcare, sent, several, private, message, ...",True,1
3,@115712 Please send us a Private Message so th...,"[115712, please, send, u, private, message, as...",False,0
4,@sprintcare I did.,[sprintcare],True,1


In [68]:
# Split dataset into train and test sets
train_data, test_data = train_test_split(d5, test_size=0.2, random_state=42)

In [69]:
train_data.head()

Unnamed: 0,text,clean_text,inbound,label
1700581,@Ask_Spectrum is there a way to find out if Sp...,"[askspectrum, way, find, spectrum, area]",True,1
2221907,@687493 Please follow/DM your service phone nu...,"[687493, please, followdm, service, phone, num...",False,0
2594626,@125713 Thank you for reaching out. We appreci...,"[125713, thank, reaching, appreciate, concern,...",False,0
869805,"@349596 Thanks for following up, Tommy. Please...","[349596, thanks, following, tommy, please, sen...",False,0
1463083,@sprintcare the lady called yesterday and got ...,"[sprintcare, lady, called, yesterday, got, not...",True,1


In [70]:
preprocessed_tweets=train_data['clean_text'].to_list()
word2vec_model = Word2Vec(sentences=preprocessed_tweets, vector_size=150, window=5, min_count=2)

In [71]:
def create_tweet_embedding(tweet_tokens, model):
    # Initialize an empty array for the embedding
    embedding = np.zeros((model.vector_size,), dtype=np.float32)
    # Count of words in the tweet
    word_count = 0
    # Iterate over each word token in the tweet
    for token in tweet_tokens:
        # If the token is in the vocabulary of the model
        if token in model.wv:
            # Add the token's embedding to the total
            embedding += model.wv[token]
            word_count += 1
    # Average the embedding by the number of words
    if word_count != 0:
        embedding /= word_count
    return embedding

In [72]:
train_data['clean_text2'] = train_data['clean_text'].apply(lambda x: ' '.join(x))
test_data['clean_text2'] = test_data['clean_text'].apply(lambda x: ' '.join(x))

In [73]:
# Create embeddings for train and test data
train_embeddings = np.array([create_tweet_embedding(tweet, word2vec_model) for tweet in train_data['clean_text2']])
test_embeddings = np.array([create_tweet_embedding(tweet, word2vec_model) for tweet in test_data['clean_text2']])

# Naive Bayes Classifier

In [74]:
gnb = GaussianNB()
gnb.fit(train_embeddings,train_data['label'])

GaussianNB()

In [75]:
# Predict on test data
predictions = gnb.predict(test_embeddings)

In [78]:
predictions

array([0, 1, 0, ..., 0, 0, 0], dtype=int64)

In [76]:
# Decode label predictions
predictions1 = label_encoder.inverse_transform(predictions)

In [77]:
from sklearn.metrics import classification_report

In [80]:
print(classification_report(test_data['label'],predictions))

              precision    recall  f1-score   support

           0       0.70      0.89      0.78    254749
           1       0.88      0.68      0.77    307606

    accuracy                           0.78    562355
   macro avg       0.79      0.79      0.78    562355
weighted avg       0.80      0.78      0.78    562355



# Random Forest Classifer

In [78]:
# Train a classifier (Random Forest as an example)
# clf = RandomForestClassifier()
# clf.fit(train_embeddings, train_data['label'])

# # Predict on test data
# predictions_rf = clf.predict(test_embeddings)

# # Decode label predictions
# predictions_rf_1 = label_encoder.inverse_transform(predictions_rf)

In [None]:
# classification_report(test_data['label'],predictions_rf)