In [55]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

### Read data files

In [56]:
testdf = pd.read_csv('../data/test.csv')

### Check original data

In [57]:
testdf.shape

(17197, 2)

#### Drop Duplicates

In [58]:
testdf.drop_duplicates(inplace = True)

In [59]:
testdf.shape

(17197, 2)

In [60]:
testdf.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 17197 entries, 0 to 17196
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      17197 non-null  int64 
 1   tweet   17197 non-null  object
dtypes: int64(1), object(1)
memory usage: 403.1+ KB


### Cleaning tweets

In [61]:
import re
import nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from string import punctuation 
from nltk.stem.porter import PorterStemmer

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/umbertoleone/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/umbertoleone/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


### Cleaning test data set 

In [62]:
#create empty list
corpus_test = []
# loop thru our train dataset 
for i in range (0, len(testdf)):
    tweet = testdf['tweet'][i]
    tweet = tweet.lower()
    tweet = re.sub('[^a-zA-Z]', ' ', tweet) #only alphabet
    tweet = re.sub('((www\.[^\s]+)|(https?://[^\s]+))', 'URL', tweet) # remove URLs
    tweet = re.sub('@[^\s]+', 'AT_USER',  tweet) # remove usernames
    tweet = re.sub(r'#([^\s]+)', r'\1', tweet) # remove the # in #hashtag
    tweet = tweet.split()
    ps = PorterStemmer()
    all_stopwords = stopwords.words('english')+ list(punctuation) + ['AT_USER','URL', 'user']
    tweet = [ps.stem(word) for word in tweet if not word in set(all_stopwords)]
    tweet = ' '.join(tweet)
    corpus_test.append(tweet)

In [63]:
len(corpus_test)

17197

In [64]:
testdf.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 17197 entries, 0 to 17196
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      17197 non-null  int64 
 1   tweet   17197 non-null  object
dtypes: int64(1), object(1)
memory usage: 1.0+ MB


In [65]:
testdf['cleaned'] = np.array(corpus_test)

In [66]:
test = testdf.drop(columns=['id', 'tweet'])

In [67]:
test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 17197 entries, 0 to 17196
Data columns (total 1 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   cleaned  17197 non-null  object
dtypes: object(1)
memory usage: 908.7+ KB


# Extracting Features

In [68]:
test.columns, test.shape

(Index(['cleaned'], dtype='object'), (17197, 1))

BOW

In [70]:
counter = joblib.load('../vectorizer.pkl')

In [71]:
from sklearn.feature_extraction.text import CountVectorizer
# bow_cv = CountVectorizer(max_features=1000)
# test_bow = bow_cv.fit(test['cleaned'])
test_bow = counter.transform(test['cleaned'])
test_bow = test_bow.toarray()

In [72]:
import joblib

In [75]:
model = joblib.load(open('classifier.pkl', "rb"))
model

GaussianNB()

In [76]:
test['Label'] = model.predict(test_bow)

test.head(5)

Unnamed: 0,cleaned,Label
0,studiolif aislif requir passion dedic willpow ...,0
1,white supremacist want everyon see new bird movi,0
2,safe way heal acn altwaystoh healthi heal,0
3,hp curs child book reserv alreadi ye harrypott...,0
4,rd bihday amaz hilari nephew eli ahmir uncl da...,0


### Try model against a user input

#### Tweet

I only have a minute. Sixty seconds it.Forced upon me, I did not choose it,But I know that I must use it. Give account if I abuse it.Suffer, if I lose it.

In [77]:
tweet = input("Enter tweet:")

Enter tweet:I only have a minute. Sixty seconds it.Forced upon me, I did not choose it,But I know that I must use it. Give account if I abuse it.Suffer, if I lose it.


In [78]:
print(tweet)

I only have a minute. Sixty seconds it.Forced upon me, I did not choose it,But I know that I must use it. Give account if I abuse it.Suffer, if I lose it.


In [79]:
user = pd.DataFrame(columns=['tweet'])

In [80]:
user = user.append({"tweet": tweet}, ignore_index=True)

In [81]:
user

Unnamed: 0,tweet
0,I only have a minute. Sixty seconds it.Forced ...


In [82]:
corpus_user = []
for i in range (0, len(user)):
    tweet = tweet.lower()
    tweet = re.sub('[^a-zA-Z]', ' ', tweet) #only alphabet
    tweet = re.sub('((www\.[^\s]+)|(https?://[^\s]+))', 'URL', tweet) # remove URLs
    tweet = re.sub('@[^\s]+', 'AT_USER',  tweet) # remove usernames
    tweet = re.sub(r'#([^\s]+)', r'\1', tweet) # remove the # in #hashtag
    tweet = tweet.split()
    ps = PorterStemmer()
    all_stopwords = stopwords.words('english')+ list(punctuation) + ['AT_USER','URL', 'user']
    tweet = [ps.stem(word) for word in tweet if not word in set(all_stopwords)]
    tweet = ' '.join(tweet)
    corpus_user.append(tweet)

Extract Features

In [83]:
corpus_user

['minut sixti second forc upon choos know must use give account abus suffer lose']

In [84]:
user['cleaned'] = np.array(corpus_user)

In [85]:
user = user.drop(columns=['tweet'])

In [86]:
# counter = joblib.load(open('../vectorizer.pkl', "rb"))

In [87]:
from sklearn.feature_extraction.text import CountVectorizer
# bow_cv = CountVectorizer(max_features=1000)
# user_bow = bow_cv.fit(user['cleaned'])
user_bow = counter.transform(user['cleaned'])
user_bow = user_bow.toarray()
user_bow.shape

(1, 1000)

In [88]:
# user_tfidf = user_tfidf.reshape(-1, 1)
predictions = model.predict(user_bow)
# x = model.predict(user_tfidf)
predictions

array([1])