# GRAD-E1326: Python Programming for Data Scientists
## Ph.D. Hannah Béchara
### Ji Yoon Han & Mariana G. Carrillo 

**Initial Project report: Tweet Sentiment Analysis**


In [None]:
#Importing libraries for sentiment analysis 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import nltk #Natural Language Processing Package 
import os #functions for interacting with the operating system
import spacy #Models for NLP
import torch #also for NLP
from tqdm.notebook import tqdm 
from transformers import BertTokenizer
from torch.utils.data import TensorDataset
import transformers #contains pretrained models to perform tasks on texts
from transformers import BertForSequenceClassification
from wordcloud import WordCloud #For nice wordclouds
import tensorflow as tf #Package to develop train models 
from tensorflow.keras.preprocessing import text 
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.layers import *
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.utils import to_categorical
from nltk.corpus import stopwords, words
from nltk.stem import WordNetLemmatizer
import time #for handling dates and times
import re 
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.metrics import AUC
from sklearn.metrics import confusion_matrix, classification_report
from nltk.tokenize import TweetTokenizer


### Loading data

In [None]:
# Load the train and test splits of the Kaggle "COVID-19 NLP text classification" dataset.
# Both CSV files live in the same Kaggle input directory and are read with the same encoding.
KAGGLE_INPUT_DIR = '/kaggle/input/covid-19-nlp-text-classification'
CSV_ENCODING = 'latin-1'

train_data = pd.read_csv(f'{KAGGLE_INPUT_DIR}/Corona_NLP_train.csv', encoding=CSV_ENCODING)
test_data = pd.read_csv(f'{KAGGLE_INPUT_DIR}/Corona_NLP_test.csv', encoding=CSV_ENCODING)

In [None]:
# Train data: preview the first five rows (head() defaults to n=5)
train_data.head()


In [None]:
# Descriptive statistics of the training data.
# NOTE(review): DataFrame.describe() with default arguments summarises only the
# numeric columns, so text columns are presumably omitted here — confirm.
train_data.describe()

In [None]:
# Test data: preview the first five rows (head() defaults to n=5)
test_data.head()




Data pre-processing

In [None]:
# Descriptive statistics of the test data.
# NOTE(review): DataFrame.describe() with default arguments summarises only the
# numeric columns, so text columns are presumably omitted here — confirm.
test_data.describe()

### Exploratory data analysis

In [None]:


# Count plot of tweets per sentiment class --> TEST DATA
# TODO: the train-data cell is identical apart from the frame and the title;
# both could share one plotting function.
sentiment_order = ['Extremely Negative', 'Negative', 'Neutral', 'Positive', 'Extremely Positive']

sns.set_palette("Spectral")  # colour palette; set before the axes are created
fig, ax = plt.subplots(figsize=(12, 6))  # explicit figure/axes instead of pyplot state
sns.countplot(x='Sentiment', data=test_data, order=sentiment_order, ax=ax)
ax.set_xlabel('Sentiment(tag)')
ax.set_ylabel('Count of tweets')
fig.suptitle('Histogram of tweet distribution per sentiment classification (Test data)')


In [None]:
# Count plot of tweets per sentiment class --> TRAIN DATA
sentiment_order = ['Extremely Negative', 'Negative', 'Neutral', 'Positive', 'Extremely Positive']

sns.set_palette("Spectral")  # colour palette; set before the axes are created
fig, ax = plt.subplots(figsize=(12, 6))  # explicit figure/axes instead of pyplot state
sns.countplot(x='Sentiment', data=train_data, order=sentiment_order, ax=ax)
ax.set_xlabel('Sentiment (tag)')
ax.set_ylabel('Count of tweets')
fig.suptitle('Histogram of tweet distribution per sentiment classification (Train data)')

In [None]:
# Tweets per day, one line per sentiment class --> TEST DATA
# NOTE(review): TweetAt is not parsed as a date when the CSV is loaded, so the
# groups are presumably ordered as strings rather than chronologically — confirm.
daily_counts = test_data.groupby(['TweetAt', 'Sentiment'])['OriginalTweet'].count()
counts_by_sentiment = daily_counts.unstack()  # rows: TweetAt, columns: Sentiment

ax = counts_by_sentiment.plot(kind='line', figsize=(12, 6))
ax.set_title('Tweets on Coronavirus March 2020 (Test data)')
ax.set_ylabel('Tweet Count')

In [None]:
# Tweets per day, one line per sentiment class --> TRAIN DATA
# NOTE(review): TweetAt is not parsed as a date when the CSV is loaded, so the
# groups are presumably ordered as strings rather than chronologically — confirm.
daily_counts = train_data.groupby(['TweetAt', 'Sentiment'])['OriginalTweet'].count()
counts_by_sentiment = daily_counts.unstack()  # rows: TweetAt, columns: Sentiment

ax = counts_by_sentiment.plot(kind='line', figsize=(12, 6))
ax.set_title('Tweets on Coronavirus, March 2020, (Train data)')
ax.set_ylabel('Tweet Count')

### Data cleaning

In [None]:
# Download the NLTK stop-word corpus (needed once per environment)
import nltk
nltk.download('stopwords')

# English stop-word list; `stopwords` is the nltk.corpus reader imported in the first cell
stop_words = stopwords.words('english') # read as a global by the clean_tweet function below

# Define function for cleaning tweets
def clean_tweet(tweet, stopword_set=None):
    """Strip noise from a raw tweet and drop stop words.

    The following are removed, in this order: URLs, HTML tags, hashtags,
    mentions and digits. Hashtags and mentions are removed *before* digits so
    that tokens such as ``#2020`` or ``@123abc`` disappear completely instead
    of leaving a dangling ``#`` / ``@`` behind.

    Stop words are matched case-insensitively, so a capitalised "The" is
    dropped just like "the".

    Parameters
    ----------
    tweet : str
        Raw tweet text. Any non-string value (e.g. ``None`` or a pandas NaN)
        yields an empty string.
    stopword_set : iterable of str, optional
        Words to drop. Defaults to the module-level ``stop_words`` list.

    Returns
    -------
    str
        The remaining words joined by single spaces.
    """
    if not isinstance(tweet, str):
        # Missing values reach us as None/NaN when applied over a DataFrame column.
        return ""

    if stopword_set is None:
        stopword_set = stop_words
    # A lower-cased set gives O(1), case-insensitive membership tests.
    stopword_set = frozenset(word.lower() for word in stopword_set)

    tweet = re.sub(r'http\S+', ' ', tweet)  # removing urls
    tweet = re.sub(r'<.*?>', ' ', tweet)    # removing html tags
    tweet = re.sub(r'#\w+', ' ', tweet)     # removing hashtags
    tweet = re.sub(r'@\w+', ' ', tweet)     # removing mentions
    tweet = re.sub(r'\d+', ' ', tweet)      # removing digits

    # removing stop words
    return " ".join(word for word in tweet.split() if word.lower() not in stopword_set)

# Clean tweets from train data by creating a new column in the train_data df
train_data['CleanTweet'] = train_data['OriginalTweet'].apply(clean_tweet)
# Only the last expression of a cell is rendered automatically, so the train
# preview has to be displayed explicitly (display() is provided by the notebook kernel).
display(train_data.head(10))

# Clean tweets from test data by creating a new column in the test_data df
test_data['CleanTweet'] = test_data['OriginalTweet'].apply(clean_tweet)
test_data.head(10)

In [None]:
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt

#Attempt to create wordcloud
def wordcloud1(training_data):
    """Draw a word cloud of the tweets in ``training_data['OriginalTweet']``.

    Parameters
    ----------
    training_data : pandas.DataFrame
        Frame with an ``OriginalTweet`` text column. The frame passed in is the
        one that is plotted (the global ``train_data`` is no longer read).

    Returns
    -------
    None
        The cloud is drawn on a new 12x6 matplotlib figure.
    """
    # Default WordCloud stop words plus a few extra tokens to ignore
    # (presumably URL fragments and encoding artefacts — confirm).
    excluded_words = set(STOPWORDS)
    excluded_words.update(["https", "00A0", "00BD", "00B8", "ed"])

    tweet_text = " ".join(training_data['OriginalTweet'].str.upper())
    cloud = WordCloud(background_color="white", stopwords=excluded_words).generate(tweet_text)

    # The figure size must be set on the figure itself; a bare `figsize = ...`
    # assignment has no effect.
    plt.figure(figsize=(12, 6))
    plt.imshow(cloud)
    plt.axis("off")
    plt.title("Most common words, training data")

wordcloud1(train_data)

In [None]:
#Attempt to create wordcloud - test data
def wordcloud2(test_data):
    """Draw a word cloud of the tweets in ``test_data['OriginalTweet']``.

    Parameters
    ----------
    test_data : pandas.DataFrame
        Frame with an ``OriginalTweet`` text column.

    Returns
    -------
    None
        The cloud is drawn on a new 12x6 matplotlib figure.
    """
    # Default WordCloud stop words plus a few extra tokens to ignore
    # (presumably URL fragments and encoding artefacts — confirm).
    excluded_words = set(STOPWORDS)
    excluded_words.update(["https", "00A0", "00BD", "00B8", "ed"])

    tweet_text = " ".join(test_data['OriginalTweet'].str.upper())
    cloud = WordCloud(background_color="white", stopwords=excluded_words).generate(tweet_text)

    # The figure size must be set on the figure itself; a bare `figsize = ...`
    # assignment has no effect.
    plt.figure(figsize=(12, 6))
    plt.imshow(cloud)
    plt.axis("off")
    # This plot shows the test split, so the title must say so.
    plt.title("Most common words, test data")

wordcloud2(test_data)

### Tokenization

In [None]:
## tweet tokenizer

import nltk
nltk.download('punkt')

from nltk.tokenize import TweetTokenizer

# Tokenise a small sample of cleaned training tweets to inspect the tokenizer output.
compare_list = train_data['CleanTweet'].head(10)

tweet_tokenizer = TweetTokenizer()

tweet_tokens = []
for sent in compare_list:
    tokens = tweet_tokenizer.tokenize(sent)  # tokenise once, then both print and store
    print(tokens)
    tweet_tokens.append(tokens)


In [None]:
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd

# Bag-of-words features and integer-encoded labels.
# Both the vocabulary and the label encoding are learned from the TRAINING data
# only and then applied unchanged to the test data, so the two splits share one
# feature space and no information leaks from the test set.

#instantiate CountVectorizer() and the label encoder
vectoriser = CountVectorizer()
encoder = LabelEncoder()  # LabelEncoder is imported in the first cell

#Generate vectors
# fit_transform learns the vocabulary / label classes and returns the word counts / label ids
X_train = vectoriser.fit_transform(train_data["CleanTweet"])
y_train = encoder.fit_transform(train_data["Sentiment"])

# transform re-uses what was learned on the training data
X_test = vectoriser.transform(test_data["CleanTweet"])
y_test = encoder.transform(test_data["Sentiment"])

# X_test already holds the word counts for the test tweets; vectorising it a
# second time (or vectorising the labels) is not meaningful. The *_wcount names
# are kept as aliases so later cells that use them keep working.
Xtest_wcount = X_test
ytest_wcount = y_test

**TF-IDF Model - Ji Yoon**

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from nltk.corpus import words

# TF-IDF features over word uni-, bi- and tri-grams, capped at 40,000 features.
tv = TfidfVectorizer(
                    ngram_range = (1,3),
                    sublinear_tf = True,
                    max_features = 40000)

# Hold out 30% of the cleaned training tweets for validation.
xtrain_text, xvalid_text, ytrain, yvalid = train_test_split(
    train_data['CleanTweet'], train_data['Sentiment'], random_state=42, test_size=0.3)

# Fit the vectoriser on the training part only; validation and test tweets are
# transformed with the same vocabulary/IDF weights so that nothing leaks from them.
xtrain_tfidf = tv.fit_transform(xtrain_text)
xvalid_tfidf = tv.transform(xvalid_text)
test_tfidf = tv.transform(test_data['CleanTweet'])

# Multiclass logistic regression; max_iter is raised from the default so the
# solver has room to converge on the high-dimensional sparse features.
lreg = LogisticRegression(max_iter=1000)
lreg.fit(xtrain_tfidf, ytrain)

# Class probabilities (one column per class, ordered as lreg.classes_) and the
# most likely class for each validation tweet. Sentiment has more than two
# classes, so a single probability threshold on one column does not apply.
prediction = lreg.predict_proba(xvalid_tfidf)
prediction_int = lreg.predict(xvalid_tfidf)

# Weighted F1 averages the per-class scores by class frequency.
f1_score(yvalid, prediction_int, average='weighted')

 

### References 
* Matplotlib.org. 2020. Pyplot Tutorial — Matplotlib 3.3.2 Documentation. [online] Available at: <https://matplotlib.org/tutorials/introductory/pyplot.html> [Accessed 20 October 2020].
* Kaggle.com. 2020. Sentiment Prediction. [online] Available at: <https://www.kaggle.com/shahraizanwar/covid19-tweets-sentiment-prediction-rnn-85-acc> [Accessed 18 October 2020].