In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
from matplotlib import cm
import json
from urllib import request
import unicodedata
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
from PIL import Image
from nltk.sentiment import SentimentIntensityAnalyzer
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for input_dir, _subdirs, input_files in os.walk('/kaggle/input'):
    for input_name in input_files:
        print(os.path.join(input_dir, input_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# **Loading Tweet Dataset**

In [None]:
# Load the tweet CSV into `df`, the working DataFrame that every later cell reads and updates.
df = pd.read_csv('../input/indianeedsoxygen-tweets/IndiaWantsOxygen.csv')

In [None]:
# Print each column's dtype and non-null count to see which columns have missing values.
df.info()

# **Dropping columns that are not required**

In [None]:
# User-profile columns that none of the later cells read.
columns_to_drop = [
    'user_name',
    'user_description',
    'user_favourites',
    'user_friends',
    'user_created',
    'user_verified',
]
# Rebind `df` to the trimmed frame rather than mutating it in place.
df = df.drop(columns=columns_to_drop)

In [None]:
# Show the first three rows of the trimmed frame.
df.head(3)

# **Dropping null values**

In [None]:
# One-column table: number of missing values per column of `df`.
null_info = df.isnull().sum().to_frame(name='Null Count')
null_info

In [None]:
# Names of the columns that contain at least one missing value.
has_nulls = null_info['Null Count'] > 0
index = null_info.loc[has_nulls].index.values
# Drop every row that is missing a value in any of those columns, then renumber
# the rows from 0 so that row labels and row positions coincide.
df = df.dropna(subset=list(index)).reset_index(drop=True)
df.shape

# **Loading the world cities dataset**

In [None]:
# Only the Country and City columns are read; rows missing either value are discarded.
cities_df = pd.read_csv(
    '../input/world-cities-database/worldcitiespop.csv',
    usecols=['Country', 'City'],
).dropna()

# **Replacing city with its country name**

In [None]:
def remove_accents(cities):
    """Return the given city names as lower-cased, ASCII-only strings.

    Each name is NFKD-normalised, which splits an accented letter into its base
    letter plus combining marks. Encoding to ASCII with ``'ignore'`` then drops
    the marks together with any other non-ASCII character.

    Parameters
    ----------
    cities : iterable of str
        City names to normalise.

    Returns
    -------
    list of str
        The normalised names, in the same order as the input.
    """
    return [
        unicodedata.normalize('NFKD', name)
        .encode('ASCII', 'ignore')
        .decode("utf-8")
        .lower()
        for name in cities
    ]
    
def get_cities(country_name):
    """Return the ``City`` values of ``cities_df`` for the requested countries.

    Parameters
    ----------
    country_name : list-like
        Values to match against ``cities_df['Country']`` (passed to ``isin``).

    Returns
    -------
    pandas.Series
        The ``City`` column restricted to the matching rows.
    """
    in_requested_countries = cities_df['Country'].isin(country_name)
    return cities_df.loc[in_requested_countries, 'City']

# Country code -> list of normalised city names. The lists are filled in below.
# NOTE(review): 'de' is loaded here but has no entry in `countries`, so the
# lookup `country_dict[countries[...]]` used later never reaches it.
country_dict = {
    code: []
    for code in ('in', 'pk', 'gb', 'us', 'ca', 'au', 'bd', 'np', 'ru', 'de', 'fr')
}

# Country name as written into `user_location` -> key of `country_dict`.
countries = {
    'india': 'in',
    'pakistan': 'pk',
    'united kingdom': 'gb',
    'united states': 'us',
    'canada': 'ca',
    'australia': 'au',
    'bangladesh': 'bd',
    'nepal': 'np',
    'russia': 'ru',
    'france': 'fr',
}

# Look up each code's cities and store them lower-cased and accent-free.
for code in country_dict:
    country_dict[code] = remove_accents(get_cities([code]))

# Hand-added entries on top of the cities dataset.
country_dict['gb'].append('london')
country_dict['ca'].append('toronto')

# Urdu-script location strings (Lahore / Pakistan / Islamabad / Karachi) that
# should also count as Pakistan.
pk_cities = [
    'لاہور, پاکستان',
    'پاکستان',
    'اسلام آباد, پاکستان',
    'کراچی, پاکستان',
]
country_dict['pk'].extend(pk_cities)

In [None]:
def replace_with_country(country, cities, country_list):
    """Rewrite matching ``df['user_location']`` values to ``country`` in place.

    Rows whose location already equals one of ``country_list`` are left alone.
    Every other location is replaced by ``country`` when any of these holds:

    * the lower-cased location contains ``country`` as a substring,
    * the lower-cased location is one of ``cities``,
    * the text before the first comma, lower-cased, is one of ``cities``.

    Known limitation: the substring test is loose, so e.g. ``'india'`` also
    matches a location containing ``'indiana'``.

    Parameters
    ----------
    country : str
        Lower-case country name to write into ``user_location``.
    cities : iterable of str
        Lower-case city names that belong to ``country``.
    country_list : list of str
        Countries already processed. ``country`` is appended to it, so later
        calls skip rows that were resolved here.

    Returns
    -------
    None
        The global ``df`` and ``country_list`` are modified in place.
    """
    # A set makes each membership test O(1); `cities` can hold many thousands
    # of names and is probed up to twice per tweet.
    city_names = set(cities)

    def is_countrys_city(x):
        lowered = x.lower()
        if country in lowered or lowered in city_names:
            return country
        # "City, Region" style locations: test only the part before the first comma.
        if x.split(',')[0].lower() in city_names:
            return country
        return x

    unresolved = df.loc[~df['user_location'].isin(country_list), 'user_location']
    if not unresolved.empty:
        # Label-based write-back addressed by column name; the original
        # `df.iloc[index_labels, 0]` was only correct while `user_location`
        # was the first column and the index was a fresh 0..n-1 range.
        df.loc[unresolved.index, 'user_location'] = unresolved.apply(is_countrys_city)
    country_list.append(country)

# Countries already written into df['user_location']; replace_with_country
# appends to this list after each pass.
country_list = []
# `countries` lists the names in the order they must be processed; each maps
# to the code that keys its city list in `country_dict`.
for country, code in countries.items():
    replace_with_country(country, country_dict[code], country_list)
    

In [None]:
plt.figure(figsize=(10,8))
sns.set(style='darkgrid')
# Count tweets per location once; the first eight entries are the busiest locations.
top_locations = df['user_location'].value_counts()[:8]
g = sns.countplot(
    y='user_location',
    data=df,
    order=top_locations.index,
    palette="muted",
)
g.set_xlabel("Number of Tweets", weight='bold')
g.set_ylabel("User Location", weight='bold')
# Bar lengths and bar rows, used by the annotation code further down in this cell.
x_cor = top_locations.values
y_cor = range(8)
def convert_to_K(x):
    """Format a non-negative count as thousands with a ``K`` suffix.

    Counts of 1000 or more are shown with one decimal (``1234 -> '1.2K'``);
    smaller counts are shown with two decimals (``345 -> '0.35K'``). Rounding is
    half-up and is done in integer arithmetic so that a carry propagates
    correctly (``1950 -> '2.0K'``, ``995 -> '1.00K'``).

    The previous string-slicing version bumped a single digit, so a rounded-up
    ``9`` became ``10`` (``'1.10K'``), and it returned ``None`` for one-digit
    counts and for counts of 10000 and above.

    Parameters
    ----------
    x : int
        Non-negative count (Python or NumPy integer).

    Returns
    -------
    str
        The abbreviated label.
    """
    count = int(x)
    if count >= 1000:
        # Number of tenths of a thousand, rounded half-up.
        tenths = (count + 50) // 100
        return f"{tenths // 10}.{tenths % 10}K"
    # Number of hundredths of a thousand, rounded half-up.
    hundredths = (count + 5) // 10
    return f"{hundredths // 100}.{hundredths % 100:02d}K"
        
# Abbreviated label for every bar, in bar order.
values = [convert_to_K(count) for count in x_cor]
# Write each label at the tip of its bar (x = bar length, y = bar row).
for bar_row, bar_length, label in zip(y_cor, x_cor, values):
    g.text(x=bar_length, y=bar_row, s=label, color='black', va="top")
g.set_title('Tweet volume of top 8 countries', weight='bold')
plt.show()

# **Word cloud of hashtags**

In [None]:
hashtags = df['hashtags'].values
# Join all entries into one text and delete the characters [ ] ' and , from it
# (presumably each entry is the string form of a Python list — TODO confirm).
list_punctuation = str.maketrans('', '', "[]',")
hashtags_text = ' '.join(hashtags).translate(list_punctuation)

# The image is handed to WordCloud as the mask that shapes the cloud.
mask = np.array(Image.open('../input/hashtag3/hashtag.jpeg'))
wordcloud = WordCloud(
    max_font_size=200,
    max_words=1500,
    background_color="black",
    collocations=False,
    mask=mask,
    colormap='Greys',
).generate(hashtags_text)

plt.figure(figsize=(19,19))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()

In [None]:
# Parse the timestamps and keep the day-of-month of every tweet.
df['date'] = pd.to_datetime(df['date'])
days = df['date'].dt.day
df['day'] = days

# Tweets per day. The output columns are named explicitly because the names
# produced by `value_counts().reset_index()` changed in pandas 2.0, where the
# bare call no longer yields an "index" column. As before, "index" holds the
# day of month and "day" holds the number of tweets on that day.
ts_df = (
    df['day']
    .value_counts()
    .rename_axis('index')
    .reset_index(name='day')
)

plt.figure(figsize=(25,7))
sns.set_theme(style="darkgrid")
sns.lineplot(x="index", y="day", data=ts_df)
# NOTE(review): only the day of month is plotted, so the "Day of April" label
# assumes all tweets fall in a single month — confirm against the data.
plt.xlabel("Day of April", weight='bold')
plt.ylabel("# Tweets", weight='bold')
plt.title('Tweet count/Day', weight='bold')
plt.show()

In [None]:
def concat(country, ts_df, arr):
    """Fill ``ts_df`` with the per-day tweet counts of one country and collect it.

    Parameters
    ----------
    country : str
        Value of ``df['user_location']`` to select.
    ts_df : pandas.DataFrame
        Frame that receives the columns ``day``, ``day count`` and ``country``.
        It is modified in place; the caller passes an empty frame.
    arr : list
        Accumulator; ``ts_df`` is appended to it.

    Returns
    -------
    list
        The same ``arr`` object, after the append.
    """
    # Name the output columns explicitly: the names produced by
    # `value_counts().reset_index()` differ between pandas 1.x and 2.x, and
    # the old `temp['index']` lookup raises KeyError on pandas >= 2.0.
    daily_counts = (
        df.loc[df['user_location'].isin([country]), 'day']
        .value_counts()
        .rename_axis('day')
        .reset_index(name='day count')
    )
    ts_df['day'] = daily_counts['day']
    ts_df['day count'] = daily_counts['day count']
    ts_df['country'] = [country] * daily_counts.shape[0]
    arr.append(ts_df)
    return arr
def timeseries_top_8_countries():
    """Build one long frame of per-day tweet counts for every key of ``countries``.

    NOTE(review): despite the name, this iterates over all of ``countries``
    (ten entries in this notebook), not over the eight busiest locations.

    Returns
    -------
    pandas.DataFrame
        Columns ``day``, ``day count`` and ``country``; one row per
        (country, day) pair that has at least one tweet.
    """
    frames = []
    for name in countries:
        per_country = pd.DataFrame(columns=['day', 'day count', 'country'])
        frames = concat(name, per_country, frames)
    # Every per-country frame is numbered from 0, so a plain concat would repeat
    # row labels. A fresh index avoids duplicate-label problems in later
    # plotting or alignment.
    return pd.concat(frames, ignore_index=True)
# Long-format frame: one row per (country, day) with its tweet count.
top_8_ts_df = timeseries_top_8_countries()

plt.figure(figsize=(25,10))
sns.set_theme(style="darkgrid")
# One line per country, coloured by the `country` column.
country_lines = sns.lineplot(data=top_8_ts_df, x="day", y="day count", hue="country")
country_lines.set_xlabel("Day of April", weight='bold')
country_lines.set_ylabel("# Tweets", weight='bold')
country_lines.set_title('Tweet count/Day of top 8 countries by tweet volume', weight='bold')
plt.show()

In [None]:
plt.figure(figsize=(8,8))
# The four most common tweet sources.
source_df = df.source.value_counts()[:4]
# Shorten the labels by removing the "Twitter for" / "Twitter" prefix, then
# strip the space that the removed prefix leaves behind.
names = [
    name.replace('Twitter for', '').replace('Twitter', '').strip()
    for name in source_df.index
]
size = source_df.values
# A white disc drawn over the centre turns the pie into a donut.
my_circle = plt.Circle((0,0), 0.7, color='white')
# `plt.get_cmap` is used because `matplotlib.cm.get_cmap` is deprecated and
# removed in recent Matplotlib releases.
plt.pie(size, labels=names, colors=plt.get_cmap('Set3').colors[5:10])
p = plt.gcf()
p.gca().add_artist(my_circle)
plt.title('Platform used for tweets', weight='bold')
plt.show()

# **Sentiment analysis of tweets**

In [None]:
sia = SentimentIntensityAnalyzer()


def _positive_from_scores(scores):
    """Map a polarity-score dict to ``(is_positive, pos_score)``.

    A tweet counts as positive when its ``compound`` score is above 0;
    otherwise the result is ``(False, 0)``.
    """
    if scores["compound"] > 0:
        return True, scores['pos']
    return False, 0


def _negative_from_scores(scores):
    """Return the ``neg`` score when ``compound`` is below 0, else 0."""
    if scores["compound"] < 0:
        return scores['neg']
    return 0


def get_sentiment_val(tweet):
    """Score one tweet and return ``(is_positive, pos_score)``."""
    return _positive_from_scores(sia.polarity_scores(tweet))


def get_neg_val(tweet):
    """Score one tweet and return its ``neg`` score if it is negative, else 0."""
    return _negative_from_scores(sia.polarity_scores(tweet))


def create_column_pos_tweet(x):
    """Return the boolean flag of an ``(is_positive, pos_score)`` pair."""
    return x[0]


def create_column_pos_val(x):
    """Return the score of an ``(is_positive, pos_score)`` pair."""
    return x[1]


# Score every tweet exactly once and derive all three columns from that single
# pass; the previous code called `polarity_scores` up to four times per tweet.
polarity = df['text'].apply(sia.polarity_scores)
result = polarity.apply(_positive_from_scores)
df['positive_tweet'] = result.apply(create_column_pos_tweet)
df['pos_value'] = result.apply(create_column_pos_val)
df['neg_value'] = polarity.apply(_negative_from_scores)

In [None]:
# First entry: positive / non-positive counts over all tweets.
tweets_sentiment_count = [df['positive_tweet'].value_counts()]
# NOTE(review): these are the first five keys of `countries`, not necessarily
# the five locations with the most tweets.
top_five = list(countries.keys())[:5]
# One more entry per selected country, in the same order as `top_five`.
for country in top_five:
    from_country = df['user_location'].isin([country])
    tweets_sentiment_count.append(df.loc[from_country, 'positive_tweet'].value_counts())

In [None]:
# A single 2x3 grid: overall counts first, then one panel per country in `top_five`.
# (The extra `plt.figure(...)` that used to precede this line only produced an
# empty figure.)
fig, ax = plt.subplots(2, 3, sharex='col', sharey='row', figsize=(20,10))

titles = ['Total'] + [name.capitalize() for name in top_five]

# Fixed bar order and colours: False (not positive) on the left in red, True
# (positive) on the right in green. Colouring by label instead of by
# `value_counts` order keeps the colours correct even when the counts are tied.
# NOTE(review): `positive_tweet` is False for compound == 0 as well, so the
# "Negative" bar also contains neutral tweets.
sentiment_labels = [False, True]
bar_colors = ['red', 'green']
bar_positions = range(len(sentiment_labels))

# `ax.flat` walks the grid row by row, matching the order of the count series.
for panel, counts, title in zip(ax.flat, tweets_sentiment_count, titles):
    # A class with no tweets is absent from `value_counts`; fill it with 0
    # instead of failing on `.loc[False]` / `.loc[True]`.
    counts = counts.reindex(sentiment_labels, fill_value=0)
    panel.bar(bar_positions, counts.values, color=bar_colors)
    for position, count in zip(bar_positions, counts.values):
        panel.text(x=position, y=count, s=str(int(count)), color='black', ha="center")
    panel.get_xaxis().set_visible(False)
    panel.get_yaxis().set_visible(False)
    panel.set_title(title)

# The legend is identical for every panel, so it is built once.
red_patch = mpatches.Patch(color='red', label='Negative')
green_patch = mpatches.Patch(color='green', label='Positive')
ax[0,1].legend(handles=[green_patch, red_patch])

fig.suptitle('Tweets\' Sentiment Count', weight='bold')
# Use `plt.show()` like the other cells of this notebook.
plt.show()

In [None]:
# Summary statistics of the follower counts.
# NOTE(review): the bucket edges 43 / 184 / 837 hard-coded in the next cell
# presumably come from the quartiles printed here — confirm.
df['user_followers'].describe()

In [None]:
fig, ax = plt.subplots(1, 4, sharex=True, figsize=(20,5))

# Split the tweets into four buckets by the author's follower count.
followers = df['user_followers']
temp = [
    df[followers <= 43].copy(),
    df[(followers > 43) & (followers <= 184)].copy(),
    df[(followers > 184) & (followers <= 837)].copy(),
    df[followers > 837].copy(),
]
titles = ['Followers <=43', '43<Followers<=184', '184<Followers<=837', 'Followers>837']

for x, (bucket, title) in enumerate(zip(temp, titles)):
    # Correlation of follower count with the positive / negative scores; only
    # the `user_followers` column of the correlation matrix is drawn.
    corr_matrix = bucket[['user_followers', 'pos_value', 'neg_value']].corr()[['user_followers']]
    # Draw the colour bar only next to the last panel.
    visible = (x == 3)
    sns.heatmap(
        corr_matrix,
        annot=True,
        cmap="YlOrBr",
        ax=ax[x],
        cbar=visible,
        fmt='.2f',
        linewidths=0.1,
        linecolor='gray',
    )
    ax[x].set_title(title, weight='bold')
    # Row labels are shown on the first panel only.
    if x != 0:
        ax[x].get_yaxis().set_visible(False)

# NOTE(review): the style is switched after the heatmaps were drawn, so it
# presumably only affects figures created afterwards — confirm intent.
plt.style.use('bmh')
plt.show()