In [None]:
#List of Packages
import pandas as pd
import os
import glob
import re
import numpy as np
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import sentiwordnet as swn
from nltk import pos_tag, map_tag
nltk.download('averaged_perceptron_tagger')
nltk.download('universal_tagset')
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
from nltk.tokenize import word_tokenize, TweetTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
pstem = PorterStemmer()
lem = WordNetLemmatizer()
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.pipeline import make_pipeline
from sklearn.metrics import confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score,precision_score,recall_score

#Downloading packages
nltk.download('punkt') 
nltk.download('stopwords')
nltk.download('sentiwordnet')
nltk.download('wordnet')
!pip install wordcloud
!pip install -U scikit-learn scipy matplotlib

In [None]:
# Collecting the list of csv files that contain the extracted tweets.
thisdir = os.getcwd()
# os.chdir() returns None, so the directory is read back with os.getcwd()
# instead of being taken from the return value of chdir().
os.chdir('D:/Study/Dissertation/Practical/Test')
currdir = os.getcwd()
dataset_path = os.listdir(currdir)

#Loading tweets from csv into dataframe.
# Each file is tagged with its own date BEFORE being combined. Assigning the
# 'Date' column on the accumulated frame would overwrite the date of every
# previously loaded file with the date of the file read last.
loaded_sheets = []
for excel_files in dataset_path:
    sheet = pd.read_csv(excel_files,encoding='utf-8')
    date = excel_files.split("_")[0] #The file name starts with the date (YYYYMMDD) followed by "_".
    sheet['Date'] = pd.to_datetime(date, format='%Y%m%d')
    loaded_sheets.append(sheet)

#Creating data_frame
# A single concat replaces the repeated DataFrame.append calls (append was
# removed in pandas 2.0 and copies the whole frame on every iteration).
covidtweet_df = pd.concat(loaded_sheets) if loaded_sheets else pd.DataFrame()

#Getting structure of the dataframe.
covidtweet_df.info()

#Viewing top 5 rows from the dataframe.
covidtweet_df.head(5)

In [None]:
#Checking for duplicates
# First count rows that are identical in every column, then rows whose tweet
# text repeats an earlier row.
duplicates = covidtweet_df.duplicated().tolist()
print(sum(duplicates))
duplicate_tweets = covidtweet_df.duplicated(subset=['text']).tolist()
print(sum(duplicate_tweets))

#Dropping duplicates (the first occurrence of each tweet text is kept).
covidtweet_df = covidtweet_df.drop_duplicates(subset=['text'])

# Reporting how many rows have no location, then dropping those rows.
print(covidtweet_df['location'].isnull().sum())
covidtweet_df = covidtweet_df.dropna(subset=['location'])

#Dropping unwanted columns from dataframe.
unwanted_columns = ['hashtags','user','acctdesc','totaltweets','retweetcount']
covidtweet_df = covidtweet_df.drop(columns=unwanted_columns)

#Renaming the columns to follow a consistent naming convention.
new_column_names = {'Unnamed: 0':'S.NO','text':'Tweets','location':'Location'}
covidtweet_df = covidtweet_df.rename(columns=new_column_names)
print(covidtweet_df.columns)

In [None]:
#Importing the county csv file used to select specific countries.
# Raw string: the path contains backslashes that must not be read as escapes.
county_excel = pd.read_csv(r'D:\Study\Dissertation\Practical\Counties_list\Counties.csv')
county_df = pd.DataFrame(county_excel)
covidtweet_df['Location'] = covidtweet_df['Location'].astype(str)

# Matching location field with county dataset for six countries such as USA, UK, Ireland, Australia, Italy and New Zealand.
# Each column of county_df is named after a country and lists place names.
# 'Country' is created up front so the checks below work even when nothing matches.
covidtweet_df['Country'] = None
for (columnName, columnData) in county_df.items():
    # Missing cells are dropped BEFORE converting to str. Converting NaN to the
    # text 'nan' would tag every location that merely contains the letters "nan".
    for item in columnData.dropna().astype(str):
        if not item:
            continue  # an empty pattern would match every location
        # NOTE(review): entries are still interpreted as regular expressions
        # (regex=True), as before - confirm no place name contains regex
        # metacharacters. When several columns match, the last one wins.
        location_matches = covidtweet_df['Location'].str.contains(item, na=False, case=False, regex=True)
        covidtweet_df.loc[location_matches, 'Country'] = columnName

#Checking null values in country field
print(covidtweet_df['Country'].isna().sum())

# Dropping rows without a matched country, then dropping the Location field
covidtweet_df = covidtweet_df.dropna(axis=0, subset=['Country'])
covidtweet_df = covidtweet_df.drop(['Location'], axis=1)

#One batch is complete: exporting the dataframe into a separate csv file.
# The file is written to the current working directory.
filename = 'coronavirus_tweets_sample_4.csv'
covidtweet_df.to_csv(filename)

In [None]:
# Merging the filtered tweet csv files for the six countries into a single final dataframe.
# Raw string: the path contains backslashes that must not be read as escapes.
os.chdir(r'D:\Study\Dissertation\Practical\Merge_dataset')
file_endswith = 'csv'
# Sorted so the merge order - and therefore which copy of a duplicated tweet
# survives the later drop_duplicates - does not depend on directory order.
Files = sorted(glob.glob('*.{}'.format(file_endswith)))

# pd.concat already returns the merged frame; the former append onto an empty
# DataFrame only copied it (and DataFrame.append was removed in pandas 2.0).
Concat_all_csv_files = pd.concat([pd.read_csv(file) for file in Files])
final_covidtweet_df = Concat_all_csv_files.copy()

#Getting length of the dataframe
final_covidtweet_df.shape[0]

In [None]:
# Checking for duplicates in the merged rows.
# Count fully identical rows first, then rows that repeat an earlier tweet text.
duplicates = final_covidtweet_df.duplicated().tolist()
print(sum(duplicates))
duplicate_tweets = final_covidtweet_df.duplicated(subset=['Tweets']).tolist()
print(sum(duplicate_tweets))

#Remove duplicates from the dataframe (first occurrence of each tweet is kept).
final_covidtweet_df = final_covidtweet_df.drop_duplicates(subset=['Tweets'])

# Printing tweet text to spot unwanted strings or special characters in the text.
raw_tweets = final_covidtweet_df["Tweets"]
print(raw_tweets)


# Removing the unwanted strings and special characters using regex patterns.
def CleaningTweetsText(reqexClean):
    """Return a lower-cased, letters-only version of a raw tweet.

    Steps, in order:
      1. remove @mentions (including a trailing ':' as in "RT @user:");
      2. remove a leading retweet marker "RT";
      3. remove http/https URLs;
      4. replace every character that is not an ASCII letter with a space.

    Parameters
    ----------
    reqexClean : str
        Raw tweet text.

    Returns
    -------
    str
        The cleaned text in lower case. Removed characters become spaces, so
        the result may contain runs of spaces.
    """
    reqexClean = re.sub(r'@[A-Z0-9a-z_:]+','',reqexClean)
    # Match the literal word "RT" only. The earlier pattern '^[RT]+' was a
    # character class and also ate the first letters of ordinary words
    # ("Trump" -> "rump", "Really" -> "eally").
    reqexClean = re.sub(r'^RT\b','',reqexClean)
    # Consume the URL up to the next whitespace so that '-', '_', '?', '='
    # inside a link do not leave fragments of it behind.
    reqexClean = re.sub(r'https?://\S+','',reqexClean)
    reqexClean = re.sub("[^a-zA-Z]",' ',reqexClean)
    return reqexClean.lower()

#Running CleaningTweetsText over every tweet and storing the result in a separate column called Cleaned_Tweets
cleaned_tweets = final_covidtweet_df['Tweets'].apply(CleaningTweetsText)
final_covidtweet_df['Cleaned_Tweets'] = cleaned_tweets

#Printing the tweet text after removing unwanted strings and special characters
print(final_covidtweet_df["Cleaned_Tweets"])

In [None]:
#Implementing NLP techniques during data pre-processing.
# This cell only previews tokenisation, POS tagging and stop-word removal for
# ONE tweet (the last row). Looping over the whole dataframe tokenised and
# tagged every tweet just to throw the results away, and left the printed
# names undefined when the dataframe was empty.
if len(final_covidtweet_df.index) > 0:
    tweet_text = final_covidtweet_df['Cleaned_Tweets'].iloc[-1]
    word_tokens = nltk.word_tokenize(tweet_text)
    stop_words_removal = [w for w in word_tokens if w not in stop_words]
    word_pos_tag = pos_tag(word_tokens)
    # Convert Penn Treebank tags to the coarse universal tagset.
    tag_words_var = [(word, map_tag('en-ptb', 'universal', tag)) for word, tag in word_pos_tag]

    #Printing the results after tokenizing.
    print(word_tokens)

    #Printing the results after pos-tagging.
    print(tag_words_var)

    #Printing the results after stop words removal.
    print(stop_words_removal)
else:
    print('No tweets available to preview.')

In [None]:
 #Calculating sentiment_score
# Universal POS tags that are scored, mapped to the WordNet POS letter used in
# synset names. Words with any other tag are ignored.
_SWN_POS_BY_UNIVERSAL_TAG = {'NOUN': 'n', 'VERB': 'v', 'ADJ': 'a', 'ADV': 'r'}

# Position of the first inserted score column (kept from the original layout).
_FIRST_SCORE_COLUMN_POSITION = 6


def _first_sense_scores(candidate, wn_pos):
    """Return (pos_score, neg_score) of sense '.01' of candidate, or None if the lookup fails."""
    try:
        senti = swn.senti_synset(candidate + '.' + wn_pos + '.01')
        return senti.pos_score(), senti.neg_score()
    except Exception:
        # Any failure of the lookup is treated as "no score for this spelling";
        # the caller then tries the next spelling.
        return None


def _lookup_word_scores(wd, wn_pos):
    """Try the word as written, then its lemma, then its Porter stem; None if all three fail."""
    for normalise in (lambda w: w, lem.lemmatize, pstem.stem):
        scores = _first_sense_scores(normalise(wd), wn_pos)
        if scores is not None:
            return scores
    return None


def calculating_sentiment(final_covidtweet_sentiscore):
    """Score every tweet with SentiWordNet and add three result columns.

    For each value of the 'Cleaned_Tweets' column the text is tokenised and
    POS-tagged; nouns, verbs, adjectives and adverbs are looked up in
    SentiWordNet (first sense only) and their positive and negative scores
    are summed per tweet.

    Parameters
    ----------
    final_covidtweet_sentiscore : pandas.DataFrame
        Must contain a 'Cleaned_Tweets' column of strings.

    Returns
    -------
    pandas.DataFrame
        The SAME object that was passed in, modified in place, with
        'positive_metrics' and 'negative_metrics' (summed scores) and
        'sentiment_metrics' (1 positive, -1 negative, 0 when both sums are 0).
        NOTE(review): a non-zero tie between the two sums is labelled -1,
        as before - confirm this is intended.
    """
    col_wd_swn = []
    col_pos_wd_swn = []
    col_neg_wd_swn = []

    # Iterating the column directly avoids building a row Series per tweet.
    for tweet_text in final_covidtweet_sentiscore['Cleaned_Tweets']:
        word_tokens = nltk.word_tokenize(tweet_text)
        tagged_words = [(wd, map_tag('en-ptb', 'universal', t)) for wd, t in pos_tag(word_tokens)]

        total_positive_Score = 0
        total_negative_Score = 0
        for wd, tags in tagged_words:
            wn_pos = _SWN_POS_BY_UNIVERSAL_TAG.get(tags)
            if wn_pos is None:
                continue
            scores = _lookup_word_scores(wd, wn_pos)
            if scores is None:
                continue  # word is not in SentiWordNet under any tried spelling
            total_positive_Score += scores[0]
            total_negative_Score += scores[1]

        col_pos_wd_swn.append(total_positive_Score)
        col_neg_wd_swn.append(total_negative_Score)

        if total_positive_Score == 0 and total_negative_Score == 0:
            col_wd_swn.append(0)
        elif total_positive_Score > total_negative_Score:
            col_wd_swn.append(1)
        else:
            col_wd_swn.append(-1)

    new_columns = (
        ("positive_metrics", col_pos_wd_swn),
        ("negative_metrics", col_neg_wd_swn),
        ("sentiment_metrics", col_wd_swn),
    )
    for offset, (column_name, values) in enumerate(new_columns):
        if column_name in final_covidtweet_sentiscore.columns:
            # Re-running the cell overwrites the scores instead of adding a
            # second column with the same name.
            final_covidtweet_sentiscore[column_name] = values
        else:
            # Clamp the position so frames with fewer columns do not fail.
            position = min(_FIRST_SCORE_COLUMN_POSITION + offset,
                           len(final_covidtweet_sentiscore.columns))
            final_covidtweet_sentiscore.insert(position, column_name, values)
    return final_covidtweet_sentiscore

#Creating a new dataframe called sentiment_dataframe
sentiment_dataframe = pd.DataFrame()

#Calling the calculating_sentiment function and storing the scored frame
sentiment_dataframe = calculating_sentiment(final_covidtweet_df)

#Removing the columns that are not needed for the research
columns_not_needed = ['Unnamed: 0', 'S.NO', 'Tweets']
sentiment_dataframe = sentiment_dataframe.drop(columns=columns_not_needed)

#Printing the number of remaining columns.
print(sentiment_dataframe.shape[1])

In [None]:
# Changing the category values from numeric to text.
def sentiment_category(x):
    """Translate a numeric sentiment code into its label.

    -1 -> 'negative', 1 -> 'positive', 0 -> 'neutral'.
    Any other value yields None.
    """
    code_labels = ((-1, 'negative'), (1, 'positive'), (0, 'neutral'))
    for code, label in code_labels:
        if x == code:
            return label
    return None

# Mapping every numeric sentiment code to its label and storing it in the Sentiment_category column
sentiment_labels = sentiment_dataframe['sentiment_metrics'].apply(sentiment_category)
sentiment_dataframe['Sentiment_category'] = sentiment_labels

In [None]:
# Creating WordCloud for positive, negative and neutral tweets.
sentiment_dataframe['Tweet_without_stopwords'] = sentiment_dataframe['Cleaned_Tweets']
.apply(lambda x: ' '.join([wd for wd in x.split() if wd not in (stop_words)]))

#List of positive, negative and neutral tweets.
positive_words = []
negative_words = []
neutral_words = []

#Storing each category of tweets in separate list.
for x in range(len(sentiment_dataframe.index)):
    if(sentiment_dataframe.iloc[x]["Sentiment_category"]=='positive'):
        positive_words+=sentiment_dataframe.iloc[x]["Tweet_without_stopwords"]
    elif(sentiment_dataframe.iloc[x]["Sentiment_category"]=='negative'):
        negative_words+=sentiment_dataframe.iloc[x]["Tweet_without_stopwords"]
    else:
        neutral_words+=sentiment_dataframe.iloc[x]["Tweet_without_stopwords"]

list_category_words = [positive_words,negative_words,neutral_words]

# Word cloud visualization.
from wordcloud import WordCloud
%matplotlib inline
import matplotlib.pyplot as plt
for category_words in list_category_words:
    word_cloud = WordCloud(width = 600,height = 600,max_font_size = 200).generate(''.join(category_words))
    plt.figure(figsize=(12,10))
    plt.imshow(word_cloud,interpolation="bilinear")
    plt.axis("off")
    plt.show()

In [None]:
# Writing the scored tweets to a csv file (in the current working directory) for the PowerBI visualization.
filename = 'Tweet_sentiment_score_PowerBI_visualization.csv'
sentiment_dataframe.to_csv(path_or_buf=filename)

In [None]:
#Building a model using LinearSVC and TfidfVectorizer
SEED = 4
x = sentiment_dataframe.Tweet_without_stopwords
y = sentiment_dataframe.sentiment_metrics
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.1,random_state=SEED)

#Converting text data into numeric using TfidfVectorizer
# Illustration only: this 11-feature matrix is not used by the model below,
# which fits its own vectorizer inside the pipeline.
Tfidvec = TfidfVectorizer(decode_error='ignore',lowercase=False,max_features=11)
x_train_copy = x_train
x_traintdf = Tfidvec.fit_transform(x_train_copy)

#SVM classifier model (unigrams + bigrams)
tdfvec = TfidfVectorizer(ngram_range=(1,2))
# random_state makes the fitted classifier reproducible between runs.
model = make_pipeline(tdfvec, LinearSVC(random_state=SEED))
model.fit(x_train,y_train)
predicted_results = model.predict(x_test.values)

# The results get their own names: assigning them to accuracy_score /
# precision_score / recall_score would replace the imported functions and
# break the cell on a second run.
model_accuracy = accuracy_score(y_test,predicted_results)
# The target has three classes (-1, 0, 1); the default average='binary' is
# rejected for multiclass targets. average=None returns one score per class,
# which is averaged when printed.
per_class_precision = precision_score(y_test,predicted_results,average=None)
per_class_recall = recall_score(y_test,predicted_results,average=None)
print(2," gram", "accuracy_score: " + str(model_accuracy), "precision_score: " + str(per_class_precision.mean()), "recall_score: " + str(per_class_recall.mean()))
print('Predicted_vales:',predicted_results)
print('acutual_values:',y_test)

In [None]:
# Using NRC lexicon to detect emotion from tweet.
from collections import OrderedDict, defaultdict, Counter
import csv
# word -> list of emotions, and emotion -> list of words.
dict_with_Word_emotion = defaultdict(list)
dict_with_emotion_word = defaultdict(list)
# Number of leading rows skipped before the word/emotion/flag records are read.
NRC_HEADER_ROWS = 46
# newline='' is how the csv module expects its input file to be opened.
with open('D:\\Study\\Dissertation\\Practical\\NRC\\NRC-emotion-lexicon-wordlevel-alphabetized-v0.92.txt', 'r', newline='') as f:
    reader = csv.reader(f, delimiter='\t')
    for _ in range(NRC_HEADER_ROWS):
        next(reader, None)
    for row in reader:
        # Blank or malformed lines cannot be unpacked into three fields; skip them.
        if len(row) != 3:
            continue
        word, emotion, present = row
        if int(present) == 1:
            dict_with_Word_emotion[word].append(emotion)
            dict_with_emotion_word[emotion].append(word)

# Getting count of emotions from the text.
def generate_emotion_count(string, tokenizer):
    """Count the lexicon emotions associated with the tokens of a text.

    Parameters
    ----------
    string : str
        Text to analyse.
    tokenizer : object
        Any object with a ``tokenize(str)`` method returning string tokens.

    Returns
    -------
    collections.Counter
        Emotion name -> number of tokens in ``string`` associated with it in
        ``dict_with_Word_emotion``. Tokens are lower-cased before the lookup.
    """
    emoCount = Counter()
    # Use the tokenizer that was passed in rather than a module-level one.
    for token in tokenizer.tokenize(string):
        # .get() instead of [] so that unseen tokens are not inserted into the
        # lexicon when it is a defaultdict.
        emoCount.update(dict_with_Word_emotion.get(token.lower(), ()))
    return emoCount

# Tokenizer used for the tweets, and the cleaned tweets to analyse
tt = TweetTokenizer()
tweets = sentiment_dataframe['Cleaned_Tweets']

# Calculating the emotion counts of every tweet using the NRC lexicon
emotionCounts = []
for twt in tweets:
    emotionCounts.append(generate_emotion_count(twt, tt))

# Building emotion_df (one row per tweet, one column per emotion) and
# replacing the missing counts with 0.
emotion_df = pd.DataFrame(emotionCounts, index=tweets.index).fillna(0)

In [None]:
# Creating new columns in the sentiment dataframe from the emotion dataframe
EMOTION_COLUMNS = ['trust', 'surprise', 'joy', 'anger', 'sadness', 'fear', 'anticipation', 'disgust']
for emotion in EMOTION_COLUMNS:
    if emotion in emotion_df.columns:
        sentiment_dataframe[emotion] = emotion_df[emotion]
    else:
        # emotion_df only has a column for emotions that occurred in at least
        # one tweet; an emotion that never occurred has a count of 0 everywhere.
        sentiment_dataframe[emotion] = 0.0

sentiment_dataframe.head(5)

# Writing the sentiment and emotion counts to a csv file (in the current working directory) for the PowerBI visualization.
filename = 'Emotion_tweet_count.csv'
sentiment_dataframe.to_csv(path_or_buf=filename)