# Off-Platform Project: Classifying Tweets

This project uses a Naive Bayes Classifier to classify any tweet (or sentence) and predict whether that sentence came from New York, London, or Paris.

# Investigate the Data

In [2]:
import pandas as pd

# The file is newline-delimited JSON (one tweet object per line), hence lines=True.
new_york_tweets = pd.read_json("new_york.json", lines=True)

# Quick look at the data: number of tweets, the available columns,
# and the text of one sample tweet (index label 12).
print(new_york_tweets.shape[0])
print(new_york_tweets.columns)
print(new_york_tweets.loc[12, "text"])

4723
Index(['contributors', 'coordinates', 'created_at', 'display_text_range',
       'entities', 'extended_entities', 'extended_tweet', 'favorite_count',
       'favorited', 'filter_level', 'geo', 'id', 'id_str',
       'in_reply_to_screen_name', 'in_reply_to_status_id',
       'in_reply_to_status_id_str', 'in_reply_to_user_id',
       'in_reply_to_user_id_str', 'is_quote_status', 'lang', 'place',
       'possibly_sensitive', 'quote_count', 'quoted_status',
       'quoted_status_id', 'quoted_status_id_str', 'quoted_status_permalink',
       'reply_count', 'retweet_count', 'retweeted', 'source', 'text',
       'timestamp_ms', 'truncated', 'user', 'withheld_in_countries'],
      dtype='object')
Be best #ThursdayThoughts


In the code block below, load the London and Paris tweets into DataFrames named `london_tweets` and `paris_tweets`.

How many London tweets are there? How many Paris ones are there?

In [3]:
# Load the other two cities the same way as New York: each file holds one
# JSON-encoded tweet per line, hence lines=True.
london_tweets = pd.read_json("london.json", lines=True)
paris_tweets = pd.read_json("paris.json", lines=True)

# Answer the question posed above: how many tweets are there per city?
print(len(london_tweets))
print(len(paris_tweets))

# Classifying using language: Naive Bayes Classifier

In [4]:
# Pull just the tweet text out of each city's DataFrame as a plain Python list.
new_york_text = new_york_tweets["text"].tolist()
london_text = london_tweets["text"].tolist()
paris_text = paris_tweets["text"].tolist()

# One combined corpus, in the order New York -> London -> Paris.
all_tweets = [*new_york_text, *london_text, *paris_text]

# One label per tweet, aligned with all_tweets:
# 0 = New York, 1 = London, 2 = Paris.
labels = [
    city_id
    for city_id, city_text in enumerate((new_york_text, london_text, paris_text))
    for _ in city_text
]

# Making a Training and Test Set

In [5]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the tweets for testing; the fixed random_state makes the
# shuffle (and therefore the split) reproducible between runs.
train_data, test_data, train_labels, test_labels = train_test_split(
    all_tweets,
    labels,
    test_size=0.2,
    random_state=1,
)

# Sizes of the training and test sets, one per line.
print(len(train_data), len(test_data), sep="\n")

10059
2515


# Making the Count Vectors

In [9]:
from sklearn.feature_extraction.text import CountVectorizer
# Learn the vocabulary from the training tweets only, so that no information
# from the test set leaks into the features. fit() returns the vectorizer itself.
counter = CountVectorizer().fit(train_data)

# Turn both sets into sparse word-count matrices over that same vocabulary.
train_counts, test_counts = (
    counter.transform(tweets) for tweets in (train_data, test_data)
)

# Show one training tweet next to its sparse count-vector representation.
print(train_data[3], train_counts[3])

saying bye is hard. Especially when youre saying bye to comfort.   (0, 5022)	2
  (0, 6371)	1
  (0, 9552)	1
  (0, 12314)	1
  (0, 13903)	1
  (0, 23994)	2
  (0, 27146)	1
  (0, 29397)	1
  (0, 30274)	1


# Train and Test the Naive Bayes Classifier

In [13]:
from sklearn.naive_bayes import MultinomialNB
# Fit a multinomial Naive Bayes model on the training word counts;
# fit() returns the estimator itself, so it can be chained onto the constructor.
classifier = MultinomialNB().fit(train_counts, train_labels)

# Predicted city label (0, 1 or 2) for every tweet in the held-out test set.
predictions = classifier.predict(test_counts)

# Evaluate the Model

In [16]:
from sklearn.metrics import accuracy_score
# Fraction of test tweets whose predicted city matches the true city.
accuracy = accuracy_score(test_labels, predictions)
print(accuracy)

0.6779324055666004


In [17]:
from sklearn.metrics import confusion_matrix
# Rows are the true labels and columns the predicted labels, both in the
# order 0 = New York, 1 = London, 2 = Paris.
city_confusion = confusion_matrix(test_labels, predictions)
print(city_confusion)

[[541 404  28]
 [203 824  34]
 [ 38 103 340]]


# Test Out a Random Tweet

In [22]:
tweet = 'I miss my friends.'

def predicted_city(tweet):
    """Print which city the classifier predicts `tweet` was written in.

    The tweet is vectorized with the already-fitted `counter` and passed to
    the trained `classifier`. Labels 0, 1 and 2 correspond to New York,
    London and Paris, matching the way `labels` was built.

    Args:
        tweet: The text of a single tweet (or any sentence).

    Returns:
        None. The prediction is reported with print().
    """
    # transform() expects an iterable of documents, so wrap the tweet in a list.
    tweet_counts = counter.transform([tweet])
    # predict() returns a one-element array here; pull out the scalar label
    # instead of comparing the whole array against a list.
    city_label = classifier.predict(tweet_counts)[0]
    if city_label == 0:
        print('The classifier predicts that this tweet is from New York.')
    elif city_label == 1:
        print('The classifier predicts that this tweet is from London.')
    elif city_label == 2:
        print('The classifier predicts that this tweet is from Paris.')

# predicted_city() prints its own message and returns None, so call it
# directly; wrapping it in print() would also print a stray "None".
predicted_city(tweet)

The classifier predicts that this tweet is from New York.
None
