## Import modules, load data, split 

In [None]:
import csv
import json
import pandas as pd
import numpy as np
import re

import nltk
from nltk import Text
from nltk.tokenize import regexp_tokenize
from nltk.tokenize import word_tokenize  
from nltk.tokenize import sent_tokenize 
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression 
from sklearn.naive_bayes import MultinomialNB, GaussianNB

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline

## Importing tweets (the original data file)

In [None]:
# pprint ('pretty print') is a print function that gives nicer output than print.
from pprint import pprint

file_test = 'geotagged_tweets_20160812-0912.jsons'
tweets_data = []

# The file holds one JSON document per line. The context manager guarantees the
# handle is closed, even if reading stops early on an error.
# NOTE(review): the dump is assumed to be UTF-8 (the JSON default) — confirm.
with open(file_test, "r", encoding="utf-8") as tweets_file:
    for line in tweets_file:
        try:
            tweet = json.loads(line)
        except json.JSONDecodeError as e:
            # Best effort: report the unparsable line and keep going.
            print(e)
            continue
        tweets_data.append(tweet)

## Build pandas dataframe with text, language, place type, location & country

In [None]:
def _place_field(tweet, field):
    """Return ``tweet['place'][field]``, or None when the tweet's place is None."""
    place = tweet['place']
    return place[field] if place is not None else None


# One row per parsed tweet: the text and language come straight from the tweet,
# the remaining columns from its (possibly missing) 'place' entry.
tweets_test = pd.DataFrame()

tweets_test['text'] = [tweet['text'] for tweet in tweets_data]
tweets_test['lang'] = [tweet['lang'] for tweet in tweets_data]
tweets_test['type'] = [_place_field(tweet, 'place_type') for tweet in tweets_data]
tweets_test['location'] = [_place_field(tweet, 'full_name') for tweet in tweets_data]
tweets_test['country'] = [_place_field(tweet, 'country') for tweet in tweets_data]

In [None]:
# Preview the first rows of the frame built from the JSON tweets.
# (Alternative view, disabled: show only the text column.)
# tweets_test['text']
tweets_test.head()

In [None]:
# Spot-check a single record: display every column of the row at position 12.
tweets_test.iloc[12]

## Read the training file from csv

In [None]:
# Labelled training tweets. The CSV is read without a header row, so the column
# names are supplied explicitly.
# (An earlier, disabled variant read the test tweets from 'tweetdata.csv'.)
file_train = 'kaggletweets.csv'
cols = ['target', 'ids', 'date', 'flag', 'user', 'text']

tweets_train = pd.read_csv(
    file_train,
    sep=',',
    header=None,
    names=cols,
    encoding='latin1',
)

## Clean tweets from hyperlinks and mentions

In [None]:
# Patterns removed from the tweet text. They are applied one after another, in
# this order, each replacing its matches with the empty string.
_secure_link = re.compile(r'https\S+')
_any_link = re.compile(r'http\S+')
_mention = re.compile(r'@\S+')

# Training set: strip hyperlinks and @mentions.
for _pattern in (_secure_link, _any_link, _mention):
    tweets_train['text'] = tweets_train['text'].apply(
        lambda tweet, _p=_pattern: _p.sub('', tweet))

# Test set: strip hyperlinks only; mention removal is disabled for this set.
for _pattern in (_secure_link, _any_link):
    tweets_test['text'] = tweets_test['text'].apply(
        lambda tweet, _p=_pattern: _p.sub('', tweet))

In [None]:
# Keep only English-language tweets located in the United States; the
# vectorizer configured below is set up with English stop words.
_is_english = tweets_test.lang == 'en'
_is_us = tweets_test.country == 'United States'

# .copy() makes the filtered result an independent frame, so adding columns to
# it later does not raise pandas' SettingWithCopyWarning.
tweets_test = tweets_test[_is_english & _is_us].copy()
tweets_test.head()

## Define training and test set with train_test_split
To assess the quality of the model, we also split the Kaggle tweets into a training set and a test set. Note that this is a different test set, used only to check model performance. Once the model is trained, we use `tweets_test` to predict labels.

In [None]:
# Hold-out split of the labelled tweets: 10% for training, 2% for evaluation,
# with a fixed seed so the split is reproducible.
_texts = tweets_train.text
_labels = tweets_train.target
X_train, X_test, y_train, y_test = train_test_split(
    _texts,
    _labels,
    train_size=0.1,
    test_size=0.02,
    random_state=42,
)

## Tf-idf Vectorizer settings

In [None]:
# Word-level tf-idf features:
#   - English stop words are removed; no stemming or lemmatization is applied
#   - a token is a run of two or more word characters, so punctuation is
#     completely ignored
#   - both unigrams and bigrams are extracted
_tfidf_settings = {
    'analyzer': 'word',
    'token_pattern': r'\w{2,}',
    'ngram_range': (1, 2),
    'stop_words': 'english',
    'lowercase': True,
    'strip_accents': 'unicode',
    'sublinear_tf': True,
}
word_vectorizer = TfidfVectorizer(**_tfidf_settings)

## Fit vectorizer to the training set

In [None]:
# Fit the tf-idf vectorizer on the training tweets only.
# NOTE(review): the same vectorizer is later placed in a pipeline that is fitted
# again through grid.fit(X_train, y_train); confirm this standalone fit is
# still needed.
word_vectorizer.fit(X_train)
# Disabled experiments, kept for reference:
# X_train_word_features = word_vectorizer.transform(X_train)
# test_features = word_vectorizer.transform(X_test)

## Transform test set with fitted vectorizer

In [None]:
# Vectorize the geotagged tweets with the vectorizer fitted on the training set.
# NOTE(review): the original author was unsure this step is necessary.
# test_features is not used by any later cell shown here (prediction is run on
# the raw text instead), so it may be removable — confirm.
test_features = word_vectorizer.transform(tweets_test.text)

## Specify classifier and create pipeline

In [None]:
# Multinomial Naive Bayes on top of the tf-idf features, chained in one pipeline.
classifier = MultinomialNB()
pipe = make_pipeline(word_vectorizer, classifier)

# Candidate values for the classifier's alpha; the 'multinomialnb__' prefix
# addresses the classifier step inside the pipeline. Evaluated with 5-fold CV.
param_grid = dict(multinomialnb__alpha=[0.1, 0.3, 0.5, 0.7, 0.9])
grid = GridSearchCV(estimator=pipe, param_grid=param_grid, cv=5)

## Fit model with grid search

In [None]:
# Run the cross-validated grid search over alpha on the training split.
grid.fit(X_train, y_train)

## Predict labels for the geotagged tweets

In [None]:
# Classify the geotagged tweets. The raw text is passed in because the fitted
# pipeline applies the vectorizer itself before the classifier.
sentiment_predictions = grid.predict(tweets_test.text)

In [None]:
# Attach the predicted sentiment label to each tweet and preview the result.
# Label encoding (per the original author's note): 0 = neg, 2 = neutral, 4 = pos
tweets_test['target'] = sentiment_predictions
tweets_test.head()

## Write results to csv

In [None]:
# Persist the classified tweets as comma-separated UTF-8 text, leaving out the
# DataFrame index.
outfile = 'tweet_classifications2.csv'
tweets_test.to_csv(
    path_or_buf=outfile,
    sep=',',
    index=False,
    encoding='utf-8',
)

In [None]:
# Sanity check: load 'tweet_classifications2.csv' from disk and preview its
# first rows.
read_table = pd.read_csv('tweet_classifications2.csv')
read_table.head()