In [2]:
# import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [297]:
all_tweets = pd.read_json('random_tweets.json', lines = True)

In [298]:
print(len(all_tweets))
print(all_tweets.columns)
print(all_tweets.info())

11099
Index(['created_at', 'id', 'id_str', 'text', 'truncated', 'entities',
       'metadata', 'source', 'in_reply_to_status_id',
       'in_reply_to_status_id_str', 'in_reply_to_user_id',
       'in_reply_to_user_id_str', 'in_reply_to_screen_name', 'user', 'geo',
       'coordinates', 'place', 'contributors', 'retweeted_status',
       'is_quote_status', 'retweet_count', 'favorite_count', 'favorited',
       'retweeted', 'lang', 'possibly_sensitive', 'quoted_status_id',
       'quoted_status_id_str', 'extended_entities', 'quoted_status',
       'withheld_in_countries'],
      dtype='object')
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11099 entries, 0 to 11098
Data columns (total 31 columns):
 #   Column                     Non-Null Count  Dtype              
---  ------                     --------------  -----              
 0   created_at                 11099 non-null  datetime64[ns, UTC]
 1   id                         11099 non-null  int64              
 2   id_str        

In [325]:
all_tweets.text.loc[4455]

'RT @UserExperienceU: #Open #IoT with #Blockchain #AI and #BigData #Futurist #IoT #BlockChain #Agile #DevOps  https://t.co/CpADsibIXv https:…'

In [329]:
symbols = []
for i in range(0, 110):
    symbols.append(all_tweets['entities'].loc[i]['symbols'])


In [335]:
symbols[107]

[{'text': 'COHO', 'indices': [4, 9]}]

In [336]:
all_tweets['text'].loc[107]

'WOW $COHO on the move 📈🔥'

#### Defining a Viral Tweet

In [309]:
q3_retweet = all_tweets['retweet_count'].quantile(0.75)

In [310]:
all_tweets['is_viral'] = np.where(all_tweets['retweet_count'] > q3_retweet, 1, 0)

In [311]:
print(all_tweets['is_viral'].value_counts())

is_viral
0    8324
1    2775
Name: count, dtype: int64


#### Making Features for Model

In [312]:
all_tweets['user'].loc[1]

{'id': 2407992339,
 'id_str': '2407992339',
 'name': 'indecent exposure',
 'screen_name': 'alyssamajor9',
 'location': 'Sherbrooke, Québec',
 'description': 'Iifes a journey enjoy it❤️',
 'url': 'https://t.co/Q7UGSdRBOO',
 'entities': {'url': {'urls': [{'url': 'https://t.co/Q7UGSdRBOO',
     'expanded_url': 'https://www.instagram.com/alyssa.major/',
     'display_url': 'instagram.com/alyssa.major/',
     'indices': [0, 23]}]},
  'description': {'urls': []}},
 'protected': False,
 'followers_count': 199,
 'friends_count': 203,
 'listed_count': 1,
 'created_at': 'Thu Mar 13 17:13:40 +0000 2014',
 'favourites_count': 2136,
 'utc_offset': None,
 'time_zone': None,
 'geo_enabled': True,
 'verified': False,
 'statuses_count': 3922,
 'lang': 'en',
 'contributors_enabled': False,
 'is_translator': False,
 'is_translation_enabled': False,
 'profile_background_color': '000000',
 'profile_background_image_url': 'http://abs.twimg.com/images/themes/theme1/bg.png',
 'profile_background_image_url_htt

In [313]:
all_tweets['verified'] = all_tweets.apply(lambda tweet: tweet['user']['verified'], axis=1)

In [314]:
all_tweets['tweet_length'] = all_tweets.apply(lambda tweet: len(tweet['text']), axis=1)

In [315]:
all_tweets['number_words'] = all_tweets.apply(lambda tweet: len(tweet['text'].split()), axis = 1)

In [316]:
all_tweets['followers_count'] = all_tweets.apply(lambda tweet: tweet['user']['followers_count'], axis = 1)

In [317]:
all_tweets['friends_count'] = all_tweets.apply(lambda tweet: tweet['user']['friends_count'], axis = 1)

In [318]:
all_tweets['number_hashtags'] = all_tweets.apply(lambda tweet: tweet['text'].count('#'), axis = 1)

In [319]:
all_tweets['number_links'] = all_tweets.apply(lambda tweet: tweet['text'].count('http'), axis = 1)

In [324]:
all_tweets['number_hashtags'].argmax()

4455

In [320]:
labels = all_tweets['is_viral']

In [321]:
data = all_tweets[['tweet_length', 'followers_count', 'friends_count', 'number_hashtags', 'number_links', 
                   'is_quote_status', 'truncated']]

In [286]:
data.describe()

Unnamed: 0,tweet_length,followers_count,friends_count,number_hashtags,number_links
count,11099.0,11099.0,11099.0,11099.0,11099.0
mean,122.833589,6009.168,1442.336337,0.232543,0.412379
std,27.850477,201314.4,7645.949991,0.725709,0.525913
min,9.0,0.0,0.0,0.0,0.0
25%,110.0,131.0,194.0,0.0,0.0
50%,140.0,403.0,442.0,0.0,0.0
75%,140.0,1249.0,1116.0,0.0,1.0
max,155.0,20211860.0,510292.0,10.0,4.0


#### Normalizing Data

In [235]:
from sklearn.preprocessing import StandardScaler

In [287]:
scale = StandardScaler()

In [288]:
scale.fit(data)
scaled_data = scale.transform(data)

In [289]:
scaled_data = pd.DataFrame(scaled_data, columns=data.columns)

#### Split into Training and Testing Sets

In [239]:
from sklearn.model_selection import train_test_split

In [348]:
X_train, X_test, y_train, y_test = train_test_split(scaled_data, labels, test_size = 0.33, random_state= 101)

#### Creating The Classifier

In [344]:
from sklearn.neighbors import KNeighborsClassifier

In [349]:
classifier = KNeighborsClassifier(n_neighbors=8)

In [350]:
classifier.fit(X_train, y_train)

#### Evaluation

In [351]:
classifier.score(X_test, y_test)

0.7562107562107562

In [352]:
y_pred = classifier.predict(X_test)

In [353]:
from sklearn.metrics import classification_report, confusion_matrix

In [354]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.78      0.93      0.85      2718
           1       0.56      0.25      0.34       945

    accuracy                           0.76      3663
   macro avg       0.67      0.59      0.60      3663
weighted avg       0.72      0.76      0.72      3663



In [355]:
print(confusion_matrix(y_test, y_pred))

[[2538  180]
 [ 713  232]]
