In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Twitter Dataset

In [2]:
# Read the tweet export from the Excel workbook next to the notebook and
# preview the first five rows to check that the columns parsed as expected.
twitter = pd.read_excel(io="twitter.xlsx")
twitter.head(n=5)

Unnamed: 0,title,content,from,locatiion,date,images,url,by,likes,shares
0,,Kalvin Phillips hopeful of World Cup inclusion...,myKhel.com,Bengaluru,2022-11-08 10:40:56,[],https://twitter.com/mykhelcom/status/158981016...,myKhel.com,0,0
1,,Qatar promises a 'carbon-neutral' FIFA World C...,Republic,"Mumbai, India",2022-11-08 10:58:55,[],https://twitter.com/republic/status/1589814697...,Republic,10,1
2,,Qatar's promise of 'carbon-neutral' World Cup ...,Carbon Credit Research,Worldwide,2022-11-08 11:08:09,[],https://twitter.com/CarbonCreditRes/status/158...,Carbon Credit Research,0,0
3,,"Around 6,000 Argentine Fans Banned from Stadiu...",Viral Cyprus,Worldwide,2022-11-08 12:00:50,[],https://twitter.com/viralposthq/status/1589830...,Viral Cyprus,0,0
4,,Happy World Cup Final Day!. Argentina to win 2...,MrX_NFT,,2022-12-18 16:21:57,[],https://twitter.com/MrX__NFT/status/1604391505...,MrX_NFT,0,0


In [3]:
# Clean the loaded frame in a single chained expression:
#   1. remove rows that are exact duplicates across all columns,
#   2. drop the "title", "images", "url" and "by" columns,
#   3. rename the remaining columns to title-case labels (this also corrects
#      the misspelled source column "locatiion").
#
# errors="ignore" keeps the cell safe to re-execute: on a second run the
# columns are already gone and a plain drop() would raise KeyError.
# rename() already ignores keys that are no longer present.
unused_columns = ["title", "images", "url", "by"]

new_column_names = {
    "content": "Title",
    "from": "Publisher",
    "locatiion": "Location",
    "date": "Date",
    "likes": "Likes",
    "shares": "Shares"
}

twitter = (
    twitter
    .drop_duplicates()
    .drop(columns=unused_columns, errors="ignore")
    .rename(columns=new_column_names)
)
twitter

Unnamed: 0,Title,Publisher,Location,Date,Likes,Shares
0,Kalvin Phillips hopeful of World Cup inclusion...,myKhel.com,Bengaluru,2022-11-08 10:40:56,0,0
1,Qatar promises a 'carbon-neutral' FIFA World C...,Republic,"Mumbai, India",2022-11-08 10:58:55,10,1
2,Qatar's promise of 'carbon-neutral' World Cup ...,Carbon Credit Research,Worldwide,2022-11-08 11:08:09,0,0
3,"Around 6,000 Argentine Fans Banned from Stadiu...",Viral Cyprus,Worldwide,2022-11-08 12:00:50,0,0
4,Happy World Cup Final Day!. Argentina to win 2...,MrX_NFT,,2022-12-18 16:21:57,0,0
...,...,...,...,...,...,...
27503,FIFA World Cup 2022: Sometimes things don’t go...,Daily News hunt 24,Mumbai,2022-11-28 21:24:41,0,0
27504,⚽️ FIFA World Cup Qatar 2022 Match schedule:⚽️...,K8 Official,,2022-11-29 03:13:17,5,1
27505,Please am I only one that has not yet watched ...,Anas zurmi,Nigeria.,2022-11-29 03:13:26,5,0
27506,This is what the #FIFAWorldCup is all about;\n...,FIFA World Cup Stats,Global �,2022-11-28 21:47:41,124,28


In [4]:
# Show the dtype pandas inferred for each remaining column.
twitter.dtypes

Title                object
Publisher            object
Location             object
Date         datetime64[ns]
Likes                 int64
Shares                int64
dtype: object

In [5]:
# (rows, columns) of the cleaned frame.
twitter.shape

(27508, 6)

In [6]:
# Default summary statistics (count, mean, std, min, quartiles, max).
twitter.describe()

Unnamed: 0,Likes,Shares
count,27508.0,27508.0
mean,186.251854,45.223971
std,4939.797974,1552.419537
min,0.0,0.0
25%,0.0,0.0
50%,0.0,0.0
75%,3.0,1.0
max,629808.0,236759.0


In [7]:
# Describe the Likes column with a denser set of percentiles than the default
# quartiles, concentrating on the upper tail of the distribution.
likes_quantile_levels = [
    .50, .60, .70, .75, .80, .85, .90,
    .95, .96, .97, .98, .99, .995, .999,
]
likes_percentiles = twitter["Likes"].describe(percentiles=likes_quantile_levels)

print(likes_percentiles)

count     27508.000000
mean        186.251854
std        4939.797974
min           0.000000
50%           0.000000
60%           1.000000
70%           2.000000
75%           3.000000
80%           5.000000
85%          10.000000
90%          23.000000
95%          75.000000
96%         109.720000
97%         174.790000
98%         347.000000
99%        1307.670000
99.5%      4133.905000
99.9%     43192.497000
max      629808.000000
Name: Likes, dtype: float64


In [8]:
# Describe the Shares column with extra upper-tail percentiles in addition to
# the median and the 75th percentile.
shares_quantile_levels = [
    .50, .75, .85, .90,
    .95, .96, .97, .98, .99, .995, .999,
]
shares_percentiles = twitter["Shares"].describe(percentiles=shares_quantile_levels)

print(shares_percentiles)

count     27508.000000
mean         45.223971
std        1552.419537
min           0.000000
50%           0.000000
75%           1.000000
85%           3.000000
90%           7.000000
95%          23.000000
96%          34.000000
97%          52.000000
98%         102.000000
99%         482.930000
99.5%      1381.000000
99.9%      7717.000000
max      236759.000000
Name: Shares, dtype: float64


In [9]:
# Number of tweets per publisher, most frequent first.
# value_counts() leaves out missing publishers by default.
publisher_column = twitter['Publisher']
publisher_counts = publisher_column.value_counts()

# Ten most frequent publishers.
publisher_counts.iloc[:10]

Gulf-Times              342
The Voice of America    121
beIN SPORTS             107
Robbie Sparkes           98
#YFM                     89
Metro                    83
AFP news agency          78
Sky News                 73
Andy Vermaut             56
AP Sports                53
Name: Publisher, dtype: int64

In [10]:
# Number of tweets per location string, most frequent first.
# NOTE(review): Location is counted as raw text, so spelling variants of the
# same place are tallied as separate entries - consider normalising the
# column before drawing conclusions from these counts.
location_column = twitter['Location']
location_counts = location_column.value_counts()

# Twenty most frequent location strings.
location_counts.iloc[:20]

London                    433
London, England           355
United States             348
Al Hilal - C Ring Road    342
Nigeria                   319
United Kingdom            311
London, UK                261
Doha, Qatar               232
India                     221
Lagos, Nigeria            213
South Africa              202
Australia                 193
New York, NY              185
Qatar                     156
Global                    142
Singapore                 136
Pakistan                  130
USA                       127
Nairobi, Kenya            126
Washington, DC            124
Name: Location, dtype: int64

In [21]:
# Reduce each timestamp to its calendar date, then count tweets per day
# (sorted by frequency, busiest day first).
tweet_days = twitter['Date'].dt.date
date_counts = tweet_days.value_counts()

print(date_counts)

2022-11-21    2802
2022-11-20    1909
2022-11-19    1329
2022-11-18    1089
2022-11-22     998
              ... 
2023-01-04       4
2023-01-08       3
2023-01-09       3
2023-01-07       2
2023-01-06       1
Name: Date, Length: 92, dtype: int64


In [29]:
# Count the missing values in each column.
twitter.isna().sum()

Title           0
Publisher       2
Location     8365
Date            0
Likes           0
Shares          0
dtype: int64

In [12]:
# Fetch the NLTK resources used below: the stop-word corpus and the punkt
# tokenizer models that word_tokenize relies on.
# NOTE(review): newer NLTK releases may additionally require 'punkt_tab' -
# confirm against the installed version.
for nltk_resource in ('stopwords', 'punkt'):
    nltk.download(nltk_resource)

# A set gives constant-time membership checks when filtering tokens.
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/farrelw/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/farrelw/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [13]:
def remove_stopwords(text):
    """Return `text` with English stop words removed.

    The text is split with `word_tokenize`; tokens whose lower-cased form is
    in `stop_words` are discarded and the remaining tokens are joined back
    into a single space-separated string.

    Values that are not strings are returned unchanged. This keeps the cell
    re-runnable: once the Title column holds token lists (after the
    tokenization step), passing a list to `word_tokenize` would raise
    TypeError.
    """
    if not isinstance(text, str):
        return text
    kept_tokens = [
        token for token in word_tokenize(text)
        if token.lower() not in stop_words
    ]
    return ' '.join(kept_tokens)


twitter['Title'] = twitter['Title'].apply(remove_stopwords)

In [14]:
# Split every title string into a list of tokens.
# The isinstance guard makes the cell idempotent: values that are already
# token lists (for example when this cell is executed a second time) are
# passed through instead of making word_tokenize raise TypeError.
twitter['Title'] = twitter['Title'].apply(
    lambda title: word_tokenize(title) if isinstance(title, str) else title
)

In [15]:
# Preview the first rows to check the Title column after processing.
twitter.head()

Unnamed: 0,Title,Publisher,Location,Date,Likes,Shares
0,"[Kalvin, Phillips, hopeful, World, Cup, inclus...",myKhel.com,Bengaluru,2022-11-08 10:40:56,0,0
1,"[Qatar, promises, 'carbon-neutral, ', FIFA, Wo...",Republic,"Mumbai, India",2022-11-08 10:58:55,10,1
2,"[Qatar, 's, promise, 'carbon-neutral, ', World...",Carbon Credit Research,Worldwide,2022-11-08 11:08:09,0,0
3,"[Around, 6,000, Argentine, Fans, Banned, Stadi...",Viral Cyprus,Worldwide,2022-11-08 12:00:50,0,0
4,"[Happy, World, Cup, Final, Day, !, ., Argentin...",MrX_NFT,,2022-12-18 16:21:57,0,0


In [36]:
# Order the tweets from most to fewest likes and show the top 20.
# Note: the name twitter_sort is reassigned by the Shares ranking cell.
twitter_sort = twitter.sort_values('Likes', ascending=False)

twitter_sort.iloc[:20]

Unnamed: 0,Title,Publisher,Location,Date,Likes,Shares
11760,"[방탄소년단, ‘, 정국, ’, 이, 2022, FIFA, 카타르, 월드컵, 공식,...",BTS_official,,2022-11-12 13:00:01,629808,236759
21390,"[’, end, Cristiano, Ronaldo, World, Cup, ., Po...",Fabrizio Romano,"Milano, Italia",2022-12-11 01:02:03,231247,27972
25127,"[Lionel, Messi, :, “, ,, ’, gon, na, retire, n...",Fabrizio Romano,"Milano, Italia",2022-12-19 03:50:35,184694,16489
11710,"[�️, Abdullah, Al, Nasari, ,, Head, Security, ...",Football Tweet ⚽,team@thesocialsup.com,2022-11-12 16:33:00,149618,27087
10593,"[Outrageously, disrespectful, Qatar, BBC, ’, b...",Piers Morgan,"London, Newick, LA.",2022-11-21 01:20:41,133871,21023
10585,"[Jungkook, BTS, Qatari, singer, Fahad, Al, Kub...",The Peninsula Qatar,Qatar,2022-11-20 23:28:34,127116,53379
10933,"[FIFA, President, Gianni, Infantino, Europe, '...",ESPN FC,,2022-11-19 19:17:28,119784,20147
23387,"[unique, experience, comes, end, ., leave, Qat...",Achraf Hakimi,,2022-12-19 22:50:15,92355,6361
23213,"[Lionel, Andrés, Messi, 1,000th, senior, game,...",Fabrizio Romano,"Milano, Italia",2022-12-04 03:37:49,87855,7189
14690,"[Topless, World, Cup, fans, arrested, Qatar, r...",Daily Loud,"Pittsburgh, PA",2022-12-22 14:27:10,84232,3909


In [30]:
# Order the tweets from most to fewest shares and show the top 20.
# NOTE(review): rows whose text starts with "RT @" appear to carry the share
# count of the original tweet (with zero likes), so retweets may repeat the
# same tweet in this ranking - confirm and consider filtering them out.
twitter_sort = twitter.sort_values('Shares', ascending=False)

twitter_sort.iloc[:20]

Unnamed: 0,Title,Publisher,Location,Date,Likes,Shares
11760,"[방탄소년단, ‘, 정국, ’, 이, 2022, FIFA, 카타르, 월드컵, 공식,...",BTS_official,,2022-11-12 13:00:01,629808,236759
10585,"[Jungkook, BTS, Qatari, singer, Fahad, Al, Kub...",The Peninsula Qatar,Qatar,2022-11-20 23:28:34,127116,53379
21390,"[’, end, Cristiano, Ronaldo, World, Cup, ., Po...",Fabrizio Romano,"Milano, Italia",2022-12-11 01:02:03,231247,27972
11710,"[�️, Abdullah, Al, Nasari, ,, Head, Security, ...",Football Tweet ⚽,team@thesocialsup.com,2022-11-12 16:33:00,149618,27087
10593,"[Outrageously, disrespectful, Qatar, BBC, ’, b...",Piers Morgan,"London, Newick, LA.",2022-11-21 01:20:41,133871,21023
10412,"[RT, @, piersmorgan, :, Outrageously, disrespe...",NEIL EVANS,QUEENSCLIFF Sydney,2022-11-21 05:26:15,0,21023
10318,"[RT, @, piersmorgan, :, Outrageously, disrespe...",Chox,"Brancaster , Norfolk , UK",2022-11-21 03:02:35,0,20332
10934,"[RT, @, ESPNFC, :, FIFA, President, Gianni, In...",Chox,"Brancaster , Norfolk , UK",2022-11-20 00:53:47,0,20147
10933,"[FIFA, President, Gianni, Infantino, Europe, '...",ESPN FC,,2022-11-19 19:17:28,119784,20147
10845,"[RT, @, ESPNFC, :, FIFA, President, Gianni, In...",E Z R A,"Abuja, Nigeria",2022-11-20 04:32:20,0,19205


In [17]:
#Drop times