# Discovering the Data

In [1]:
import pandas as pd
import numpy as np


In [3]:
# Load the New York tweets; lines=True makes pandas parse the file as one JSON object per line.
new_york_tweets = pd.read_json("new_york.json", lines=True)

In [4]:
# Number of rows loaded into the New York frame.
new_york_tweets.shape[0]

4723

In [5]:
# List the column labels to see which fields each record carries (e.g. 'text').
new_york_tweets.columns

Index(['created_at', 'id', 'id_str', 'text', 'display_text_range', 'source',
       'truncated', 'in_reply_to_status_id', 'in_reply_to_status_id_str',
       'in_reply_to_user_id', 'in_reply_to_user_id_str',
       'in_reply_to_screen_name', 'user', 'geo', 'coordinates', 'place',
       'contributors', 'is_quote_status', 'quote_count', 'reply_count',
       'retweet_count', 'favorite_count', 'entities', 'favorited', 'retweeted',
       'filter_level', 'lang', 'timestamp_ms', 'extended_tweet',
       'possibly_sensitive', 'quoted_status_id', 'quoted_status_id_str',
       'quoted_status', 'quoted_status_permalink', 'extended_entities',
       'withheld_in_countries'],
      dtype='object')

In [9]:
# Preview the text of the rows labelled 0 through 2 (.loc label slices include the end label).
# Rows and column are selected in one .loc call instead of chaining .loc[...][...],
# which indexes twice and goes through an intermediate frame.
new_york_tweets.loc[0:2, 'text']

0                     @DelgadoforNY19 Calendar marked.
1      petition to ban more than one spritz of cologne
2    People really be making up beef with you in th...
Name: text, dtype: object

In [10]:
# Load the London tweets (parsed as one JSON object per line) and show how many rows were read.
london_tweets = pd.read_json("london.json", lines=True)
london_tweets.shape[0]

5341

In [11]:
# Preview the text of the rows labelled 0 through 2 (.loc label slices include the end label).
# Rows and column are selected in one .loc call instead of chaining .loc[...][...],
# which indexes twice and goes through an intermediate frame.
london_tweets.loc[0:2, 'text']

0    @bbclaurak i agree Laura but the Party you see...
1                                @masturbacaolove Why?
2    @JackRobinson80 @pgroresearch Yeah not great b...
Name: text, dtype: object

In [13]:
# Load the Paris tweets (parsed as one JSON object per line) and show how many rows were read.
paris_tweets = pd.read_json("paris.json", lines=True)
paris_tweets.shape[0]

2510

In [15]:
# Preview the text of the rows labelled 0 through 2 (.loc label slices include the end label).
# Rows and column are selected in one .loc call instead of chaining .loc[...][...],
# which indexes twice and goes through an intermediate frame.
paris_tweets.loc[0:2, 'text']

0    Bulletin météo parisien : des grêlons énormes ...
1    Prêt pour le match #USORCL https://t.co/V5jw0S...
2    MAIS QOIDBDNND'SLS'SLSLLSLS''D DBDODNDNODJDBKD...
Name: text, dtype: object

In [29]:
# Pull the 'text' column out of each city's frame as a plain Python list.
new_york_text, london_text, paris_text = (
    city_tweets['text'].tolist()
    for city_tweets in (new_york_tweets, london_tweets, paris_tweets)
)
# Combined corpus, ordered New York, then London, then Paris.
all_tweets = [*new_york_text, *london_text, *paris_text]

In [41]:
# One integer class label per tweet, grouped by city:
# 0 = New York, 1 = London, 2 = Paris.
labels = [
    city_label
    for city_label, city_text in enumerate((new_york_text, london_text, paris_text))
    for _ in city_text
]

# Making a Training and Test Set

In [36]:
from sklearn.model_selection import train_test_split

In [40]:
# Hold out 20% of the tweets for evaluation; random_state is fixed so the split is repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    all_tweets,
    labels,
    test_size=0.2,
    random_state=42,
)
# Report the size of each split, one per line.
print(len(x_train), len(x_test), sep="\n")

10059
2515


# Making the Count Vectors



In [42]:
from sklearn.feature_extraction.text import CountVectorizer

In [44]:
# Learn the vocabulary from the training tweets only, then encode both splits with it.
counter = CountVectorizer().fit(x_train)
train_counts, test_counts = (counter.transform(split) for split in (x_train, x_test))

In [48]:
# Show the first training tweet, followed by its row of the count matrix.
print(x_train[0], train_counts[0], sep="\n")

💦 Ensure your lashes are always clean, especially in this hot weather 💦
.
.
To book your appointment, click the lin… https://t.co/7JHNrfFaSm
  (0, 1142)	1
  (0, 2330)	1
  (0, 2764)	1
  (0, 2838)	1
  (0, 4464)	1
  (0, 6115)	1
  (0, 6136)	1
  (0, 6214)	1
  (0, 9384)	1
  (0, 9563)	1
  (0, 12951)	1
  (0, 13036)	1
  (0, 13537)	1
  (0, 15710)	1
  (0, 16166)	1
  (0, 26597)	1
  (0, 26806)	1
  (0, 27066)	1
  (0, 29189)	1
  (0, 30194)	2


# Training our model

In [51]:
from sklearn.naive_bayes import MultinomialNB

# Fit a multinomial Naive Bayes classifier on the training counts,
# then predict a class label for every held-out tweet.
model = MultinomialNB().fit(train_counts, y_train)
preds = model.predict(test_counts)
preds

array([2, 1, 1, ..., 1, 2, 2])

# Evaluating our model

In [52]:
from sklearn.metrics import accuracy_score, confusion_matrix

# Print the overall accuracy, then the confusion matrix
# (rows are the true labels, columns are the predicted labels).
print(accuracy_score(y_test, preds), confusion_matrix(y_test, preds), sep="\n")

0.7145129224652087
[[607 352  12]
 [204 835  28]
 [ 30  92 355]]


In [80]:
# Spot-check the classifier on a hand-written phrase.
exp = "Eiffel Tower"
# transform() takes an iterable of documents, so the single phrase is wrapped in a list.
tweet_counts = counter.transform([exp])
# Prints the predicted class label (0 = New York, 1 = London, 2 = Paris).
print(model.predict(tweet_counts))

[2]


In [79]:
# Spot-check the classifier on a hand-written phrase.
exp = "Donald Trump"
# transform() takes an iterable of documents, so the single phrase is wrapped in a list.
tweet_counts = counter.transform([exp])
# Prints the predicted class label (0 = New York, 1 = London, 2 = Paris).
print(model.predict(tweet_counts))

[0]


In [75]:
# Spot-check the classifier on a hand-written phrase.
exp = "Boxing day"
# transform() takes an iterable of documents, so the single phrase is wrapped in a list.
tweet_counts = counter.transform([exp])
# Prints the predicted class label (0 = New York, 1 = London, 2 = Paris).
print(model.predict(tweet_counts))

[1]
