In [428]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

Importing Python Libraries

In [429]:
import pandas as pd
import numpy as np
import nltk
import string
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

from sklearn.metrics import f1_score

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

Loading Test and Train Data


In [430]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [431]:
train.sentiment.value_counts()
pd.set_option('display.max_colwidth', -1)

  


Cleaning up data

In [432]:
pattern_url = r'http[s]?://(?:[A-Za-z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9A-Fa-f][0-9A-Fa-f]))+'
subs_url = r'url-web'
train['message'] = train['message'].replace(to_replace = pattern_url, value = subs_url, regex = True)
train['message'] = train['message'].str.lower()
test['message'] = test['message'].replace(to_replace = pattern_url, value = subs_url, regex = True)
test['message'] = test['message'].str.lower()
train.head()

Unnamed: 0,sentiment,message,tweetid
0,1,"polyscimajor epa chief doesn't think carbon dioxide is main cause of global warming and.. wait, what!? url-web via @mashable",625221
1,1,it's not like we lack evidence of anthropogenic global warming,126103
2,2,rt @rawstory: researchers say we have three years to act on climate change before it’s too late url-web url-web…,698562
3,1,#todayinmaker# wired : 2016 was a pivotal year in the war on climate change url-web,573736
4,1,"rt @soynoviodetodas: it's 2016, and a racist, sexist, climate change denying bigot is leading in the polls. #electionnight",466954


Removing Punctuation

In [433]:
def remove_punctuation(message):
    return ''.join([l for l in message if l not in string.punctuation])
train['message']  = train['message'].apply(remove_punctuation)
test['message']  = test['message'].apply(remove_punctuation)
train.head()

Unnamed: 0,sentiment,message,tweetid
0,1,polyscimajor epa chief doesnt think carbon dioxide is main cause of global warming and wait what urlweb via mashable,625221
1,1,its not like we lack evidence of anthropogenic global warming,126103
2,2,rt rawstory researchers say we have three years to act on climate change before it’s too late urlweb urlweb…,698562
3,1,todayinmaker wired 2016 was a pivotal year in the war on climate change urlweb,573736
4,1,rt soynoviodetodas its 2016 and a racist sexist climate change denying bigot is leading in the polls electionnight,466954


Upsampling Training Dataset

In [434]:
from sklearn.utils import resample
believe = train[train['sentiment'] == 1]
not_believe = train[train['sentiment'] == -1]
neutral = train[train['sentiment'] == 0]
news = train[train['sentiment'] == 2]
not_believe_upsampled = resample(not_believe,
                               replace=True, 
                               n_samples=len(believe),
                               random_state=27) 
neutral_upsampled = resample(neutral,
                               replace=True, 
                               n_samples=len(believe), 
                               random_state=27) 
news_upsampled = resample(news,
                               replace=True,
                               n_samples=len(believe),
                               random_state=27)
upsampled = pd.concat([believe, not_believe_upsampled,neutral_upsampled,news_upsampled])

In [435]:
upsampled.sentiment.value_counts()

-1    8530
 2    8530
 1    8530
 0    8530
Name: sentiment, dtype: int64

Splitting out the X variable from the target

In [436]:
y = upsampled['sentiment']
X = upsampled['message']

Changing from text to number using vectorizer

In [437]:
vectorizer = TfidfVectorizer(ngram_range=(1,2), min_df=2, stop_words=None)
X_vectorized = vectorizer.fit_transform(X)

Splitting the training data into a training and validation set

In [438]:
X_train,X_val,y_train,y_val = train_test_split(X_vectorized, y, test_size=0.01, random_state=25)

Training the model and predict


In [439]:
logreg = LogisticRegression(solver = 'saga', C=5, fit_intercept=False, random_state=0)
logreg.fit(X_train, y_train)
logreg_pred = logreg.predict(X_val)

Checking the performance of our model on the validation set

In [440]:
f1_score(y_val, logreg_pred, average="macro")

0.9707542085376324

Preparing test set

In [441]:
testx = test['message']
test_vect = vectorizer.transform(testx)

Making predictions on the test set and adding a sentiment column to our original test df

In [442]:
y_pred = logreg.predict(test_vect)
test['sentiment'] = y_pred
test.head()

Unnamed: 0,message,tweetid,sentiment
0,europe will now be looking to china to make sure that it is not alone in fighting climate change… urlweb,169760,1
1,combine this with the polling of staffers re climate change and womens rights and you have a fascist state urlweb,35326,1
2,the scary unimpeachable evidence that climate change is already here urlweb itstimetochange climatechange zeroco2,224985,1
3,karoli morgfair osborneink dailykos \nputin got to you too jill \ntrump doesnt believe in climate change at all \nthinks its s hoax,476263,1
4,rt fakewillmoore female orgasms cause global warming\nsarcastic republican,872928,0


Creating an output csv for submission

In [443]:
test[['tweetid','sentiment']].to_csv('Raymond_Classification_Submission.csv', index=False)