In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree lazily and print the full path of every file found
input_file_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# List of Contents:

[1. Input and EDA](#section1)

[2. Tokenization and Cleaning](#section2)

[3. Plotting Number of Tweets per day](#section3)

[4. Building a WordCloud](#section4)

[5. Logistic Regression](#section5)

[6. Topwords DataFrame](#section6)

[7. Scatterplot and Closing Thoughts](#section7)


# 1. Input and EDA
<a class="anchor" id="section1"></a>
We are given the following variables in the data:

* Tweet ID
* Content
* Location


The scope of the analysis here will be to join the Twitter data from all 100 files and apply NLP techniques to discern **which words in the content correspond to a particular region.**

But first, let's get started with some EDA on the overall dataset.

In [None]:
#Importing the required modules for EDA and ETL
import os
import glob #glob is a tool to index and list multiple files for convenient reading
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.style as style

style.use('Solarize_Light2')

# glob returns matches in arbitrary filesystem order; sorting makes the order in
# which the files are later read and concatenated reproducible between runs.
gl = sorted(glob.glob('../input/twitter-dateset-collected-during-covid19-pandemic/data_twitter/*.csv'))
print('There are {} files total'.format(len(gl)))

The `glob.glob` function returns the list of file paths that match a pattern. We can then loop over that list (for example, with a list comprehension) to read every input file.



In [None]:
#Reading in and concatenating tweet data into a single dataframe
frames = [pd.read_csv(path) for path in gl]

# ignore_index=True: every file is read with its own 0..n-1 row labels, so a plain
# concat would leave duplicate labels in the combined frame.
# Columns 1-3 (by position) are kept and renamed below.
li = pd.concat(frames, ignore_index=True).iloc[:, 1:4]
li.columns = ['time','content','location']

li.info()

---

# 2. Tokenization and Cleaning
<a class="anchor" id="section2"></a>

The `nltk` package offers a neat `TweetTokenizer` class to break up an individual tweet into tokens.

From there, we will strip out stop words and any tokens that are not purely alphabetic to prep the data for visualization. Since we're dealing with 800,000+ tweets, we are going to limit the scope of this analysis to two major cities, [Chennai](https://en.wikipedia.org/wiki/Chennai) and [Bangalore](https://en.wikipedia.org/wiki/Bangalore), and compare their features and trends.

In [None]:
from nltk.tokenize import TweetTokenizer 
from nltk.corpus import stopwords

twe = TweetTokenizer()

#Filtering the data to two large scale Indian cities, Chennai and Bangalore.
# .copy() gives an independent frame, so the column assignment below does not
# write into a slice of `li` (which triggers SettingWithCopyWarning).
che = li[li['location'].isin(["Chennai","Bangalore"])].copy()
che['time'] = pd.to_datetime(che['time'])

# NLTK's English stop-word list is lowercase, so tokens are lowercased for the
# comparison (otherwise "The", "I", ... slip through). A set gives O(1) lookups.
stop = set(stopwords.words('english'))

#Creating a new list of cleaned tweets: tokenize, drop stop words, keep only
#purely alphabetic tokens (str.isalpha also drops numbers, hashtags, @mentions,
#URLs and emoji), then re-join the surviving tokens with single spaces.
chetoken = [
    ' '.join(
        tok for tok in twe.tokenize(tweet)
        if tok.isalpha() and tok.lower() not in stop
    )
    for tweet in che['content']
]


chetoken[:5] #Previewing the first five cleaned tweets

---

# 3. Plotting Number of Tweets per day by City
<a class="anchor" id="section3"></a>


In [None]:
#Plotting the number of daily tweets over observation period by city
# Truncate timestamps to midnight so tweets are counted per calendar day
# (a no-op if the `time` column already holds dates without a time-of-day part).
daily = che[['time','location','content']].assign(time=che['time'].dt.normalize())
vis1 = daily.groupby(['time','location']).agg('count').reset_index()

# relplot is a figure-level function: it creates its own figure, so a preceding
# plt.figure(...) only produces an empty extra figure. Size is set via height/aspect.
ax1 = sns.relplot(data=vis1,x='time',y='content',hue='location',kind='line',style='location',
                  height=6,aspect=2)
ax1.set_xticklabels( rotation=40, ha="right")
# y>1 lifts the title above the axes so it does not overlap the plot area
ax1.fig.suptitle('Number of Daily Tweets by City', y=1.02)
ax1.set_xlabels("Date")
ax1.set_ylabels('Number of COVID-19 tweets')




It appears that the timeseries of tweets posted in both cities are correlated and appear to peak between 15-APR and 15-MAY.

This is contextually relevant, as India was subject to stringent lockdown measures beginning 23-MAR and the lockdowns/curfews were extended in April, so it's possible that we're looking at some public consternation here. Let's put together a WordCloud to glance at the national sentiment.

---

# 4. Building a WordCloud with the entirety of Tweets
<a class="anchor" id="section4"></a>


To get an idea of what insights lie in the content of the tweets, a WordCloud is a great way to sum up the most frequently occurring terms.

We accomplish this by using the tokenized list of tweets we generated earlier `chetoken` and proceeding to flatten it.

Then we load a picture of a COVID spike protein and use it as a mask to shape our wordcloud.

In [None]:
#Importing the packages necessary to build a wordcloud
from PIL import Image
from wordcloud import WordCloud


#Consolidating our corpus into a single text string for processing.
# Tweets are joined with a space; joining with '' would fuse the last word of
# each tweet with the first word of the next one.
singletext = ' '.join(map(str, chetoken))

#Creating the mask (wordcloud shape) in the form of a COVID spike protein.
# The context manager closes the image file once its pixels are copied to an array.
with Image.open('../input/covidimg/covidspike.jpg') as mask_image:
    covidmask = np.array(mask_image)

#Putting the wordcloud together.
# width/height are not passed: WordCloud ignores them when a mask is supplied
# and uses the shape of the mask instead.
wordcloud1 = WordCloud(colormap='GnBu',mask=covidmask,
                       max_words=400,background_color = 'white')
wordcloud1.generate(singletext)

plt.figure(figsize=(35,23))
plt.imshow(wordcloud1, interpolation="bilinear")
plt.axis("off")
plt.margins(x=0, y=0)
plt.show()

We can readily observe that the word **lockdown** is featured prominently in the wordcloud, along with words and phrases like positive case, govt and COVID.

This observation further points to what the tweet volume from mid April to mid May could have been about.


---

# 5. Using Logistic Regression to uncover Region Specific words 
<a class="anchor" id="section5"></a>

This will follow the same ML pipeline as your vanilla Binary Classification logistic regression with one twist:

We use a `CountVectorizer` sparse matrix as the feature to train and test the data instead of a pandas dataframe or series.


In [None]:
#Building a binomial classifier with the following packages for Logistic Regression
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words features: ASCII-fold accents, lowercase, drop English stop words
vectorizer = CountVectorizer(strip_accents='ascii',lowercase=True,stop_words='english')

# A fixed random_state makes the 75/25 split (and hence every score and
# coefficient reported below) reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(chetoken, che.location,
                                                    test_size=0.25, random_state=42)

# Fit the vocabulary on the training tweets only, then reuse it for the test tweets
trvect = vectorizer.fit_transform(X_train)
tevect = vectorizer.transform(X_test)

trvect

In [None]:
# Fit the classifier on the training count matrix (fit returns the estimator itself)
lr = LogisticRegression(max_iter=3000, C=10).fit(trvect, y_train)

# Predict the city of every held-out tweet
y_pred = lr.predict(tevect)

# Report overall accuracy, then show the confusion matrix as the cell output
test_accuracy = accuracy_score(y_test, y_pred)
print('The Accuracy Score is : {0:2.2%}'.format(test_accuracy))
confusion_matrix(y_test, y_pred)

In [None]:
# Per-class metrics for the held-out tweets, returned as a nested dict
class_metrics = classification_report(y_true=y_test, y_pred=y_pred, output_dict=True)
class_metrics

We've correctly classified about 72% of 39,000+ tweets by location as emanating from Chennai or Bangalore. Those are fair results; now on to the fun stuff of mining the data for insights.


---

# 6. Building a top words data frame
<a class="anchor" id="section6"></a>

We'll be extracting the log odds from the `coef_` attribute of the fitted `LogisticRegression` model for this one.
Feature names can be gathered directly from the `CountVectorizer` object we created. 

From there on, we create a dataframe of the top 25 region-specific words per city and move on to visualization and interpretation. We narrowed the data down to 25 words per city as we're dealing with over 50,000 features in the entire corpus.


In [None]:
#Building a dataframe of top words associated by city of origin
# get_feature_names() was removed in scikit-learn 1.2; prefer the replacement
# get_feature_names_out() and fall back to the old name on older releases.
get_names = getattr(vectorizer, 'get_feature_names_out', None) or vectorizer.get_feature_names

# exp(coefficient) turns each word's log-odds into an odds ratio.
# coef_ has shape (1, n_features) for a binary model, hence the ravel().
features = pd.DataFrame({'Odds': np.exp(lr.coef_.ravel())}, index=list(get_names()))
features = features.sort_values('Odds')

#Streamlining the features to the top 25 region specific words per city
bow = vectorizer.vocabulary_   # word -> column index in the count matrix
topwords = pd.concat([features.head(25), features.tail(25)])

# Corpus frequency of a word is the sum of its COLUMN in the document-term matrix.
# (Indexing trvect[i] would select document ROW i instead.)
term_totals = np.asarray(trvect.sum(axis=0)).ravel()
topwords['frequency'] = [term_totals[bow[word]] for word in topwords.index]

# The binary model's coefficients refer to classes_[1] (the positive class):
# odds below 1 point to classes_[0], odds of 1 or more to classes_[1].
topwords['city'] = np.where(topwords.Odds < 1, lr.classes_[0], lr.classes_[1])

#Examining Last 10 words of DataFrame
topwords.tail(10)

Since we're dealing with the odds in a binary classifier, we have a scenario where if any of the above words are used in a tweet, the odds are over 100:1 that the **tweet originated in Chennai as opposed to Bangalore** (i.e the probability of the tweet being from Chennai divided by the probability of it not being from Chennai)

As for the content, the words refer to names of local areas within the city or other Named Entities, so the information checks out content-wise.

In [None]:
#Similarly examining the first 10 rows of the DataFrame (lowest odds)
topwords.iloc[:10]

Here we've got a list of Bangalore-specific words. The Odds look different than they do for Chennai because they are the inverse of the odds we looked at above (i.e. if any of these words are used in a tweet, the odds of it emanating from Chennai are 1:100).

This happens because Tweets labelled 'Chennai' comprise the positive class of the Classifier

---

# 7. Scatterplot and Closing Thoughts
<a class="anchor" id="section7"></a>

We're going to use the `topwords` dataframe we looked at above to construct an annotated scatterplot to better visualize the previously tabulated results.

Note that the original dataset had tweets from ALL Indian Cities which were filtered to Chennai and Bangalore. 

The code written through this kernel can be converted to a function to return region specific terms for any given pair of Cities or locales.

Thanks for hopping along for the ride! Feel free to comment as always, and Happy Coding!

In [None]:
# Log of the odds ratio: positive values lean Chennai, negative values lean Bangalore
topwords['logodds'] = np.log(topwords.Odds)

plt.figure(figsize=(20,10))
ax2 = sns.scatterplot(x='frequency',y = 'logodds',hue='city',data=topwords, palette="Set2")
ax2.set_title('Most Region specific words used by Chennaites and Bangaloreans')

# Annotate every point with its word. The frame is indexed by the words themselves,
# so rows are walked with zip instead of integer keys (Series[int] on a
# string-labelled index relies on a deprecated positional fallback).
for word, freq, log_odds in zip(topwords.index, topwords['frequency'], topwords['logodds']):
    ax2.text(freq + 0.2, log_odds, word,
             horizontalalignment='left', size='small', color='black')


---
