## Usual imports

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import pprint as pp
import re

# Getting an idea of the data

### **Report**

- 41157 tweets labeled

 - *Positive*              11422 (0.28)
 - *Negative*              9917 (0.24)
 - *Neutral*               7713 (0.19)
 - *Extremely Positive*    6624 (0.16)
 - *Extremely Negative*    5481 (0.13)

 
- The *'UserName'* and *'ScreenName'* columns are just coded values used to preserve anonymity, so we can drop them.

### First hypothesis (H0)

- Tweet mentions don't help to classify sentiment
- Tweet hashtags don't help to classify sentiment

In [None]:
# Load the train and test splits from the Kaggle input directory.
# Both files are decoded as latin-1.
train_path = '/kaggle/input/covid-19-nlp-text-classification/Corona_NLP_train.csv'
test_path = '/kaggle/input/covid-19-nlp-text-classification/Corona_NLP_test.csv'
trainset, testset = (
    pd.read_csv(csv_path, encoding='latin-1')
    for csv_path in (train_path, test_path)
)

In [None]:
# Preview the first rows of the training set.
trainset.head()

In [None]:
# Summary of the training set: column names, dtypes and non-null counts.
trainset.info()

Percentage of each Sentiment label in the train set

In [None]:
# Share of each sentiment label; normalize=True returns fractions instead of raw counts.
trainset['Sentiment'].value_counts(normalize=True)

In [None]:
# Bar chart of the number of tweets per sentiment label.
# Labels and heights are both taken from the same value_counts() result so each bar
# is paired with its own label: unique() lists labels in order of first appearance,
# whereas value_counts() is sorted by descending count, so mixing the two can
# attach the wrong label to a bar.
# x= / y= are passed by keyword because recent seaborn releases no longer accept
# the data vectors positionally.
sentiment_counts = trainset['Sentiment'].value_counts()
sns.barplot(x=sentiment_counts.index, y=sentiment_counts.values)

Top 25 values for the 'Location' column

 - As we can see, certain locations are repeated. We could maybe use NER to group locations

In [None]:
# The 25 most frequent 'Location' values (value_counts() sorts by descending count).
trainset['Location'].value_counts()[:25]

Creating new features on the dataset: mentions, hashtags, urls (and their counts) / Dropping the 'UserName' and 'ScreenName' columns

In [None]:
# Quick sanity check of the hashtag / mention patterns on hand-written samples.
# - Raw strings keep `\w` from being interpreted as an (invalid) string escape.
# - The first character may be an upper- or lower-case letter, so tags such as
#   '#coronavirus' are no longer silently skipped.
hashtag_regex = re.compile(r'(#[A-Za-z]\w+)')
test = '#Covid#Iran and #Trump #coronavirus'
group = hashtag_regex.findall(test)
for elt in group:
    print(elt)

mention_regex = re.compile(r'(@[A-Za-z]\w+)')
test = "@Trump's etc @ABC@Obama @who"
mention_regex.findall(test)

In [None]:
# Build the EDA frame: drop the two id columns, parse the tweet date and derive
# mention / hashtag / url features from the raw tweet text.
# The patterns are raw strings (so `\w` is not read as an invalid string escape)
# and accept a lower-case first letter, so tags/handles such as '#coronavirus'
# are not silently dropped.
mention_regex = re.compile(r'(@[A-Za-z]\w+)')
hashtag_regex = re.compile(r'(#[A-Za-z]\w+)')

eda_data = trainset.drop(['UserName', 'ScreenName'], axis=1)
# NOTE(review): no explicit date format is given; if 'TweetAt' is stored day-first,
# ambiguous dates may be parsed month-first — confirm against the raw file.
eda_data['TweetAt'] = pd.to_datetime(eda_data['TweetAt'])

# Series.map returns a Series carrying the frame's own index, so every new column
# stays aligned with its tweet even if the index is not a default 0..n-1 range.
tweets = eda_data['OriginalTweet']
eda_data['mentions'] = tweets.map(mention_regex.findall)
eda_data['hashtags'] = tweets.map(hashtag_regex.findall)
# A whitespace-separated token counts as a url as soon as it contains 'http'.
eda_data['urls'] = tweets.map(lambda tweet: [token for token in tweet.split() if 'http' in token])
for feature in ('hashtags', 'mentions', 'urls'):
    # Counts are stored as floats, one per tweet.
    eda_data[feature + '_count'] = eda_data[feature].map(len).astype(float)
# Strip carriage returns / line feeds before measuring the tweet length.
eda_data['cleaned_tweet'] = tweets.map(lambda tweet: tweet.replace('\r', '').replace('\n', ''))
eda_data['tweet_length'] = eda_data['cleaned_tweet'].map(len)
eda_data.head()

In [None]:
# One sub-frame per sentiment label, in the order:
# Positive, Extremely Positive, Negative, Extremely Negative, Neutral.
sentiment_labels = eda_data['Sentiment']
dfs = [
    eda_data[sentiment_labels == label]
    for label in (
        'Positive',
        'Extremely Positive',
        'Negative',
        'Extremely Negative',
        'Neutral',
    )
]
pos, ex_pos, neg, ex_neg, neutral = dfs

In [None]:
# One figure per feature: bar height is seaborn's default aggregate (the mean) of
# the per-tweet count within each sentiment label.
# x= / y= / data= are passed by keyword because recent seaborn releases no longer
# accept the data vectors positionally.
for col in ['mentions_count', 'hashtags_count', 'urls_count']:
    plt.figure()
    sns.barplot(x='Sentiment', y=col, data=eda_data)

**Note**: Extremely negative tweets systematically have fewer mentions / hashtags / urls

In [None]:
# Count how often each mention / hashtag occurs within every sentiment subset.
# Resulting layout: dict[sentiment][column][token] -> number of occurrences.
# NOTE: the name `dict` shadows the built-in type; it is kept because the
# following cells read the result through this name.
dict = {}
for df in dfs:
    if df.empty:
        # An empty subset has no label to read and nothing to count;
        # indexing its first row would raise IndexError.
        continue
    sent = df['Sentiment'].iloc[0]
    dict[sent] = {col: {} for col in ['mentions', 'hashtags']}
    for col, counts in dict[sent].items():
        # Each cell of df[col] holds the list of tokens found in one tweet.
        for values in df[col]:
            for value in values:
                counts[value] = counts.get(value, 0) + 1

In [None]:
# Rank the tokens of every sentiment / column by descending frequency.
# Each token -> count mapping is replaced by a list of (token, count) pairs.
# Entries that are already lists (this cell executed a second time) are simply
# re-sorted instead of failing on the missing .items() method.
for sent in dict.keys():
    for col in dict[sent].keys():
        counts = dict[sent][col]
        pairs = counts.items() if hasattr(counts, 'items') else counts
        dict[sent][col] = sorted(pairs, key=lambda x: x[1], reverse=True)

In [None]:
# Print the first ten entries stored for every (sentiment, column) pair.
for sent, per_column in dict.items():
    for col, ranked in per_column.items():
        print('Top 10', col, 'for ', sent, ':', ranked[:10], '\n')