## Import modules, load data, split 

In [1]:
import csv
import json
import pandas as pd
import numpy as np
import re

import nltk
from nltk import Text
from nltk.tokenize import regexp_tokenize
from nltk.tokenize import word_tokenize  
from nltk.tokenize import sent_tokenize 
from nltk.corpus import stopwords
# from nltk.stem import WordNetLemmatizer
# from nltk.stem import PorterStemmer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression 
from sklearn.naive_bayes import MultinomialNB, GaussianNB

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline

## Importing tweets (the original data file)

In [2]:
#Pprint is 'pretty print', simply a print function that gives 'nicer' outputs than print
from pprint import pprint

file_test = 'geotagged_tweets_20160812-0912.jsons'
tweets_file = open(file_test, "r")
tweets_data = []

for line in tweets_file:
    try:
        tweet = json.loads(line)
        tweets_data.append(tweet)
    except Exception as e:
        print (e)
        continue

## Build pandas dataframe with text, language & country

In [49]:
tweets_test = pd.DataFrame()

tweets_test['text'] =    list(map(lambda tweet: tweet['text'], tweets_data))
tweets_test['lang'] =    list(map(lambda tweet: tweet['lang'], tweets_data))
tweets_test['type'] = list(map(lambda tweet: tweet['place']['place_type'] if tweet['place'] != None else None, tweets_data))
tweets_test['location'] = list(map(lambda tweet: tweet['place']['full_name'] if tweet['place'] != None else None, tweets_data))
tweets_test['country'] = list(map(lambda tweet: tweet['place']['country'] if tweet['place'] != None else None, tweets_data))

In [50]:
# tweets_test['text']
tweets_test.head()

Unnamed: 0,text,lang,type,location,country
0,@theblaze @realDonaldTrump https://t.co/TY9DlZ...,und,city,"Frontenac, MO",United States
1,@BarackObama \n@FBI\n@LORETTALYNCH \nALL IN CO...,en,city,"Baton Rouge, LA",United States
2,@theblaze @realDonaldTrump https://t.co/n050DB...,und,city,"Frontenac, MO",United States
3,@HillaryClinton he will do in one year all the...,en,city,"Melbourne, Victoria",Australia
4,#CNN #newday clear #Trump deliberately throwin...,en,city,"Baltimore, MD",United States


In [51]:
tweets_test.iloc[12]

text        Can't stand @HillaryClinton or @realDonaldTrum...
lang                                                       en
type                                                     city
location                                       Middletown, KY
country                                         United States
Name: 12, dtype: object

## Read the training file from csv

In [52]:
file_train = 'kaggletweets.csv'
# file_test = 'tweetdata.csv'
cols = ['target', 'ids', 'date', 'flag', 'user', 'text']

tweets_train = pd.read_csv(file_train, delimiter=',', encoding='latin1', header=None, names=cols)
# tweets_test = pd.read_csv(file_test, delimiter=',', encoding='latin1')

## Clean tweets from hyperlinks and mentions

In [53]:
# training set
tweets_train.text = tweets_train.text.apply(lambda tweet: re.sub(r'https\S+', '', tweet))
tweets_train.text = tweets_train.text.apply(lambda tweet: re.sub(r'http\S+', '', tweet))
tweets_train.text = tweets_train.text.apply(lambda tweet: re.sub(r'@\S+', '', tweet))

# test set
tweets_test.text = tweets_test.text.apply(lambda tweet: re.sub(r'https\S+', '', tweet))
tweets_test.text = tweets_test.text.apply(lambda tweet: re.sub(r'http\S+', '', tweet))
# tweets_test.text = tweets_test.text.apply(lambda tweet: re.sub(r'@\S+', '', tweet))

In [54]:
# only want english tweets from us as the tfidfvectorizer will work only with english words
tweets_test = tweets_test[(tweets_test.lang == 'en') & (tweets_test.country == 'United States')]
tweets_test.head()

Unnamed: 0,text,lang,type,location,country
1,@BarackObama \n@FBI\n@LORETTALYNCH \nALL IN CO...,en,city,"Baton Rouge, LA",United States
4,#CNN #newday clear #Trump deliberately throwin...,en,city,"Baltimore, MD",United States
5,"@realDonaldTrump, you wouldn't recognize a lie...",en,city,"Palm Springs, CA",United States
7,"""Kid, you know, suing someone? Thats the most ...",en,city,"Secaucus, NJ",United States
8,@HillaryClinton you ARE the co-founder of ISIS...,en,city,"Irving, TX",United States


## Define training and test set with train_test_split
To assess the quality of the model we split the tweets from the Kaggle into training and test sets as well. THIS IS A DIFFERENT TEST SET THAT IS ONLY USED TO CHECK MODEL PERFORMANCE. When the model is trained we use 'tweets_test' to predict labels.

In [55]:
X_train, X_test, y_train, y_test = train_test_split(tweets_train.text, tweets_train.target,
                                                   train_size=0.1, test_size=0.02, random_state=42)

## Tf-idf Vectorizer settings

In [56]:
# remove stop words only, no stemming or lemmatization
# punctuation is completely ignored
word_vectorizer = TfidfVectorizer(
    stop_words='english',
    lowercase=True,
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'\w{2,}',
    ngram_range=(1, 2))

## Fit vectorizer to the training set

In [57]:
word_vectorizer.fit(X_train)
# X_train_word_features = word_vectorizer.transform(X_train)
# test_features = word_vectorizer.transform(X_test)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 2), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words='english', strip_accents='unicode', sublinear_tf=True,
        token_pattern='\\w{2,}', tokenizer=None, use_idf=True,
        vocabulary=None)

## Transform test set with fitted vectorizer

In [58]:
# not too sure if this is necessary as it wont work with predict
test_features = word_vectorizer.transform(tweets_test.text)

## Specify classifier and create pipeline

In [59]:
classifier = MultinomialNB()
pipe = make_pipeline(word_vectorizer, classifier)
param_grid = {'multinomialnb__alpha': [0.1, 0.3, 0.5, 0.7, 0.9]}
grid = GridSearchCV(pipe, param_grid, cv=5)

## Fit model with grid search

In [60]:
grid.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('tfidfvectorizer', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 2), norm='l2', preprocessor=None, smooth_i...   vocabulary=None)), ('multinomialnb', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'multinomialnb__alpha': [0.1, 0.3, 0.5, 0.7, 0.9]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

## Predict labels for the test features

In [61]:
sentiment_predictions = grid.predict(tweets_test.text)

In [62]:
# 0 = neg, 2 = neutral, 4 = pos
tweets_test['target'] = sentiment_predictions
tweets_test.head()

Unnamed: 0,text,lang,type,location,country,target
1,@BarackObama \n@FBI\n@LORETTALYNCH \nALL IN CO...,en,city,"Baton Rouge, LA",United States,4
4,#CNN #newday clear #Trump deliberately throwin...,en,city,"Baltimore, MD",United States,0
5,"@realDonaldTrump, you wouldn't recognize a lie...",en,city,"Palm Springs, CA",United States,0
7,"""Kid, you know, suing someone? Thats the most ...",en,city,"Secaucus, NJ",United States,4
8,@HillaryClinton you ARE the co-founder of ISIS...,en,city,"Irving, TX",United States,0


## Write results to csv

In [63]:
outfile = 'tweet_classifications2.csv'
tweets_test.to_csv(outfile, sep=',', encoding='utf-8', index=False)

In [64]:
read_table = pd.read_csv('tweet_classifications2.csv')
read_table.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,text,lang,type,location,country,target
0,@BarackObama \n@FBI\n@LORETTALYNCH \nALL IN CO...,en,city,"Baton Rouge, LA",United States,4.0
1,#CNN #newday clear #Trump deliberately throwin...,en,city,"Baltimore, MD",United States,0.0
2,"@realDonaldTrump, you wouldn't recognize a lie...",en,city,"Palm Springs, CA",United States,0.0
3,"""Kid, you know, suing someone? Thats the most ...",en,city,"Secaucus, NJ",United States,4.0
4,@HillaryClinton you ARE the co-founder of ISIS...,en,city,"Irving, TX",United States,0.0


## Other data set & exploration