In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the processed training set and discard its 'Unnamed: 0' column
# (presumably an index that was saved along with the file), then preview it.
df = pd.read_csv('Datasets/processed_train.csv').drop(columns=['Unnamed: 0'])
df.head()

Unnamed: 0,text,sentiment,Time of Tweet,Country,Population -2020,Density (P/Km²),word_count,char_count
0,"I`d have responded, if I were going",neutral,morning,Afghanistan,38928346,60,7,36
1,Sooo SAD I will miss you here in San Diego!!!,negative,noon,Albania,2877797,105,10,46
2,my boss is bullying me...,negative,night,Algeria,43851044,18,5,25
3,what interview! leave me alone,negative,morning,Andorra,77265,164,5,31
4,"Sons of ****, why couldn`t they put them on t...",negative,noon,Angola,32866272,26,14,75


## Basic Information

In [3]:
# Shape
# df.shape is the (rows, columns) tuple of the frame
print(f"Shape of Dataset is {df.shape}")

Shape of Dataset is (27481, 8)


In [4]:
# Columns in Dataset
# Prints the column labels that remain after the load cell dropped 'Unnamed: 0'
print(df.columns)

Index(['text', 'sentiment', 'Time of Tweet', 'Country', 'Population -2020',
       'Density (P/Km²)', 'word_count', 'char_count'],
      dtype='object')


In [5]:
# Information
# DataFrame.info() writes its report directly to stdout and returns None,
# so it is called bare; wrapping it in print() appends a stray "None" line.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27481 entries, 0 to 27480
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   text              27480 non-null  object
 1   sentiment         27481 non-null  object
 2   Time of Tweet     27481 non-null  object
 3   Country           27481 non-null  object
 4   Population -2020  27481 non-null  int64 
 5   Density (P/Km²)   27481 non-null  int64 
 6   word_count        27481 non-null  int64 
 7   char_count        27481 non-null  int64 
dtypes: int64(4), object(4)
memory usage: 1.7+ MB
None


In [6]:
# Describing Number type
# Excluding the object columns leaves only the int64 columns in the summary
df.describe(exclude=object)

Unnamed: 0,Population -2020,Density (P/Km²),word_count,char_count
count,27481.0,27481.0,27481.0,27481.0
mean,40184970.0,357.686583,12.902442,68.352571
std,150494600.0,2013.750702,6.926185,35.62595
min,801.0,2.0,1.0,3.0
25%,1968001.0,35.0,7.0,39.0
50%,8655535.0,89.0,12.0,64.0
75%,28435940.0,214.0,18.0,97.0
max,1439324000.0,26337.0,33.0,159.0


In [7]:
# Describing Object Type
# For object/bool columns the summary reports count, unique, top and freq
df.describe(include=['object', 'bool'])

Unnamed: 0,text,sentiment,Time of Tweet,Country
count,27480,27481,27481,27481
unique,27480,3,3,195
top,All this flirting going on - The ATG smiles...,neutral,morning,Afghanistan
freq,1,11118,9161,149


In [8]:
# Value counts
# Number of tweets for each sentiment label
print(f"Counts of Sentiments are \n {df['sentiment'].value_counts()}")

print("="*20)

# normalize=True reports each label's share of the rows instead of a raw count
print(f"Fraction of Sentiments are \n {df['sentiment'].value_counts(normalize=True)}")

print("="*20)

# Number of tweets for each value of 'Time of Tweet'
print(f"Counts of tweets based on time \n {df['Time of Tweet'].value_counts()}")

print("="*20)

# Number of tweets for each country
print(f"Counts of Tweets by country \n {df['Country'].value_counts()}")

Counts of Sentiments are 
 sentiment
neutral     11118
positive     8582
negative     7781
Name: count, dtype: int64
Fraction of Sentiments are 
 sentiment
neutral     0.404570
positive    0.312288
negative    0.283141
Name: proportion, dtype: float64
Counts of tweets based on time 
 Time of Tweet
morning    9161
noon       9160
night      9160
Name: count, dtype: int64
Counts of Tweets by country 
 Country
Afghanistan    149
Albania        149
Algeria        149
Andorra        149
Angola         149
              ... 
Venezuela      127
Vietnam        127
Yemen          127
Zambia         127
Zimbabwe       127
Name: count, Length: 195, dtype: int64


In [9]:
# Shortest tweets first. `by` takes a list, so additional sort keys can be
# appended to it when a multi-column ordering is needed.
(
    df
    .sort_values(by=['word_count'])  # ascending order is the default
    .head()
)

Unnamed: 0,text,sentiment,Time of Tweet,Country,Population -2020,Density (P/Km²),word_count,char_count
68,Chilliin,positive,night,Guatemala,17915568,167,1,8
42,MAYDAY?!,neutral,morning,Cuba,11326616,106,1,8
10363,thanks,positive,noon,Nepal,29136808,203,1,8
24069,G`night!,neutral,morning,Singapore,5850342,8358,1,11
11658,Goodnight!!!!!!,positive,morning,Nauru,10824,541,1,16


In [10]:
# Mean of Word Count
print(f"Mean of Word_count columns is {df['word_count'].mean()}")

# Median of Word Count
print(f"Median of Word_count columns is {df['word_count'].median()}")

# Most Frequent Value in Sentiment
# Series.mode() returns a Series (several values can tie for most frequent),
# so the labels are joined into plain text instead of printing the Series
# repr with its index and dtype footer.
print(f"The most frequent sentiment is {', '.join(df['sentiment'].mode().astype(str))}")

Mean of Word_count columns is 12.902441686983734
Median of Word_count columns is 12.0
The most frequent sentiment is 0    neutral
Name: sentiment, dtype: object


In [11]:
# How many tweets are both neutral in sentiment and posted in the morning
print(
    "Length is "
    f"{len(df.loc[df['sentiment'].eq('neutral') & df['Time of Tweet'].eq('morning')])}"
)

Length is 3763


In [12]:
# Applying function to a column
def country_char_count(text):
    """Return the number of characters in ``text``.

    Args:
        text: A string (in this notebook, a country name) or any other
            sized object.

    Returns:
        int: The length of ``text``; 0 for an empty string.
    """
    # The built-in len() replaces the manual one-by-one counting loop.
    return len(text)

# Call country_char_count once per value of 'Country' and store the results as a new column
df['country_char_count'] = df['Country'].apply(country_char_count)
# Preview the new column next to the column it was derived from
df[['Country', 'country_char_count']].head()

Unnamed: 0,Country,country_char_count
0,Afghanistan,11
1,Albania,7
2,Algeria,7
3,Andorra,7
4,Angola,6


In [13]:
# Applying a function to a column to build a row filter
# Rows where Country name starts with 'I'
# The lambda runs once per Country value and yields a boolean mask that selects
# the matching rows. str.startswith is used rather than country[0] so that an
# empty name evaluates to False instead of raising IndexError.
df[df['Country'].apply(lambda country: country.startswith('I'))].head()

Unnamed: 0,text,sentiment,Time of Tweet,Country,Population -2020,Density (P/Km²),word_count,char_count,country_char_count
76,"WOW, i AM REALLY MiSSiN THE FAM(iLY) TODAY. BA...",negative,noon,Iceland,341243,3,9,50,7
77,My sources say no,neutral,night,India,1380004385,464,4,18,5
78,I am sooo tired,negative,morning,Indonesia,273523615,151,4,15,9
79,"Hey, you change your twitter account, and you...",neutral,noon,Iran,83992949,52,12,69,4
80,THANK YYYYYYYYYOOOOOOOOOOUUUUU!,positive,night,Iraq,40222493,93,2,32,4


In [14]:
# Map Function
# Translate each sentiment label into a signed integer score via a lookup dict
sentiment_values = {"negative": -1, "neutral": 0, "positive": 1}
df['sentiment_value'] = df['sentiment'].map(sentiment_values)

# Using Replace
# Swap each time-of-day label for its short code
time_values = {'morning': 'M', 'noon': 'N', 'night': 'Ni'}
df['time_value'] = df['Time of Tweet'].replace(time_values)

# Show the original columns side by side with their encoded versions
df[['sentiment', 'sentiment_value', 'Time of Tweet', 'time_value']].head()

Unnamed: 0,sentiment,sentiment_value,Time of Tweet,time_value
0,neutral,0,morning,M
1,negative,-1,noon,N
2,negative,-1,night,Ni
3,negative,-1,morning,M
4,negative,-1,noon,N


#### Grouping

In [15]:
df.head(3)

Unnamed: 0,text,sentiment,Time of Tweet,Country,Population -2020,Density (P/Km²),word_count,char_count,country_char_count,sentiment_value,time_value
0,"I`d have responded, if I were going",neutral,morning,Afghanistan,38928346,60,7,36,11,0,M
1,Sooo SAD I will miss you here in San Diego!!!,negative,noon,Albania,2877797,105,10,46,7,-1,N
2,my boss is bullying me...,negative,night,Algeria,43851044,18,5,25,7,-1,Ni


In [16]:
# Per-sentiment summary of the two numeric country columns, limited to
# mean/std/min/max (calling .describe() on the same selection is the
# alternative that was tried here).
(
    df
    .groupby(by=['sentiment'])[['Population -2020', 'Density (P/Km²)']]
    .agg(["mean", "std", "min", "max"])
)


Unnamed: 0_level_0,Population -2020,Population -2020,Population -2020,Population -2020,Density (P/Km²),Density (P/Km²),Density (P/Km²),Density (P/Km²)
Unnamed: 0_level_1,mean,std,min,max,mean,std,min,max
sentiment,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
negative,40170010.0,152300200.0,801,1439323776,370.607634,2072.60501,2,26337
neutral,39971860.0,148926700.0,801,1439323776,363.194729,2058.009058,2,26337
positive,40474620.0,150885000.0,801,1439323776,338.835703,1898.439638,2,26337


## Preprocessing

#### Handling Missing Values

#### Conversion of Datatypes

#### Handling Duplicates

#### Handling Outliers

#### Exploratory Data Analysis

#### Train Test Split

#### Feature Engineering

#### Feature Selection

#### Encoding Categorical Data

#### Pipeline and ColumnTransformer

#### Model Selection

#### Model Training

#### Model Evaluation