In [None]:
import pandas as pd
import numpy as np
import re
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
# ML Libraries
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

In [None]:
# Download the NLTK packages this notebook requests: 'punkt', 'stopwords'
# and 'wordnet'. `nltk` is imported once here instead of before every call.
import nltk
from nltk.tokenize import TweetTokenizer

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

## FUNCTION TO LOAD DATASETS

In [None]:
def load_dataset(filename, cols):
    """Read a CSV file and relabel its columns.

    Parameters
    ----------
    filename : path or file-like object handed straight to ``pd.read_csv``.
    cols : sequence of column labels that replaces the labels pandas parsed;
        its length must match the number of parsed columns.

    Returns
    -------
    pandas.DataFrame with ``cols`` as its column labels.

    NOTE(review): ``read_csv`` is called with its default header handling, so
    the first line of the file is consumed as a header and then overwritten by
    ``cols`` -- confirm the input files really start with a header row.
    """
    frame = pd.read_csv(filename, encoding='latin-1')
    # Overwrite whatever labels were parsed with the caller-supplied ones.
    frame.columns = cols
    return frame

In [None]:
# Load the labelled training tweets and name the three columns.
train_path = "drive/My Drive/Project/train.txt"
train_columns = ['tweet_id', 'sentiment', 'tweet_text']
dataset = load_dataset(train_path, train_columns)
dataset.head()

Unnamed: 0,tweet_id,sentiment,tweet_text
0,264183816548130816,positive,Gas by my house hit $3.39!!!! I\u2019m going t...
1,263405084770172928,negative,Theo Walcott is still shit\u002c watch Rafa an...
2,262163168678248449,negative,its not that I\u2019m a GSP fan\u002c i just h...
3,264249301910310912,negative,Iranian general says Israel\u2019s Iron Dome c...
4,262682041215234048,neutral,Tehran\u002c Mon Amour: Obama Tried to Establi...


In [None]:
# Load the unlabelled test tweets (id and text only).
test_path = "drive/My Drive/Project/test.txt"
test_columns = ["tweet_id", "tweet_text"]
test_ds = load_dataset(test_path, test_columns)
test_ds.head()

Unnamed: 0,tweet_id,tweet_text
0,264238274963451904,"@jjuueellzz down in the Atlantic city, ventnor..."
1,218775148495515649,Musical awareness: Great Big Beautiful Tomorro...
2,258965201766998017,On Radio786 100.4fm 7:10 Fri Oct 19 Labour ana...
3,262926411352903682,"Kapan sih lo ngebuktiin,jan ngomong doang Susa..."
4,171874368908050432,"Excuse the connectivity of this live stream, f..."


In [None]:
def remove_unwanted_cols(dataset, cols):
    """Delete the named columns from ``dataset`` in place.

    Parameters
    ----------
    dataset : pandas.DataFrame that is modified by this call.
    cols : iterable of column labels to delete, processed in order.

    Returns
    -------
    The same ``dataset`` object that was passed in (not a copy).

    Raises
    ------
    KeyError if a label is not present; columns listed before it have
    already been removed by then.
    """
    for unwanted in cols:
        # pop() removes the column from the frame; the returned column is unused.
        dataset.pop(unwanted)
    return dataset

## TEXT CLEANING FUNCTION

In [None]:
import re
from nltk.corpus import stopwords


def cleanTweet(tweet):
    r"""Normalise one raw tweet string.

    Steps, in order:
      1. Decode literal ``\uXXXX`` escape sequences (a backslash, ``u`` and
         four hex digits stored as plain text) into the character they stand
         for, so that they are treated as punctuation instead of leaving
         fragments such as ``u2019`` glued to neighbouring words.
      2. Remove ``@mentions``.
      3. Remove ``http://`` / ``https://`` links up to the next whitespace.
      4. Drop every character that is not alphanumeric, a space or ``#``.
      5. Lower-case the result.

    Parameters
    ----------
    tweet : str

    Returns
    -------
    str -- the cleaned text. Whitespace is not collapsed, so removed items can
    leave consecutive spaces behind.
    """
    # Decode textual \uXXXX escapes first; otherwise step 4 strips only the
    # backslash and the hex code is fused into the surrounding word.
    tweet = re.sub(
        r'\\u([0-9A-Fa-f]{4})',
        lambda match: chr(int(match.group(1), 16)),
        tweet,
    )

    # Remove user mentions
    tweet = re.sub(r'@[A-Za-z0-9_]+', '', tweet)

    # Remove links: everything from the scheme up to the next whitespace, so
    # characters such as '-', '_', '?', '=' and '%' inside a URL go too.
    tweet = re.sub(r'https?://\S+', '', tweet)

    # Remove symbols (hashtag markers are kept)
    tweet = ''.join(ch for ch in tweet if ch.isalnum() or ch == ' ' or ch == '#')

    # Convert tweet into lowercase
    return tweet.lower()

## APPLYING THE TEXT CLEAN FUNCTION TO TRAINING DATA

In [None]:
# Clean every training tweet in a single pass. Series.map builds the whole
# 'CleanText' column at once instead of writing one cell per row via .loc.
dataset['CleanText'] = dataset['tweet_text'].map(cleanTweet)

dataset.head()

Unnamed: 0,tweet_id,sentiment,tweet_text,CleanText
0,264183816548130816,positive,Gas by my house hit $3.39!!!! I\u2019m going t...,gas by my house hit 339 iu2019m going to chape...
1,263405084770172928,negative,Theo Walcott is still shit\u002c watch Rafa an...,theo walcott is still shitu002c watch rafa and...
2,262163168678248449,negative,its not that I\u2019m a GSP fan\u002c i just h...,its not that iu2019m a gsp fanu002c i just hat...
3,264249301910310912,negative,Iranian general says Israel\u2019s Iron Dome c...,iranian general says israelu2019s iron dome ca...
4,262682041215234048,neutral,Tehran\u002c Mon Amour: Obama Tried to Establi...,tehranu002c mon amour obama tried to establish...


## TOKENIZATION

> Split each cleaned tweet into tokens with NLTK's `TweetTokenizer` and drop English stopwords.





In [None]:
from nltk.tokenize import TweetTokenizer

tknzr = TweetTokenizer()

# Build the stopword collection once. Calling stopwords.words('english')
# inside the filter re-evaluates it for every single token, and a set gives
# constant-time membership tests.
english_stopwords = set(stopwords.words('english'))

# List that contains a list of the tokens of each tweet, stopwords removed
tokens = [
    [w for w in tknzr.tokenize(clean_text) if w not in english_stopwords]
    for clean_text in dataset['CleanText']
]

# Add tokens as a new column to the dataframe
dataset['Tokens'] = pd.Series(tokens, index=dataset.index)

dataset.head()

Unnamed: 0,tweet_id,sentiment,tweet_text,CleanText,Tokens
0,264183816548130816,positive,Gas by my house hit $3.39!!!! I\u2019m going t...,gas by my house hit 339 iu2019m going to chape...,"[gas, house, hit, 339, iu2019m, going, chapel,..."
1,263405084770172928,negative,Theo Walcott is still shit\u002c watch Rafa an...,theo walcott is still shitu002c watch rafa and...,"[theo, walcott, still, shitu, 002c, watch, raf..."
2,262163168678248449,negative,its not that I\u2019m a GSP fan\u002c i just h...,its not that iu2019m a gsp fanu002c i just hat...,"[iu2019m, gsp, fanu, 002c, hate, nick, diaz, c..."
3,264249301910310912,negative,Iranian general says Israel\u2019s Iron Dome c...,iranian general says israelu2019s iron dome ca...,"[iranian, general, says, israelu, 2019s, iron,..."
4,262682041215234048,neutral,Tehran\u002c Mon Amour: Obama Tried to Establi...,tehranu002c mon amour obama tried to establish...,"[tehranu, 002c, mon, amour, obama, tried, esta..."


## LEMMATIZATION

In [None]:
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

# List that contains a list of the lemmas of the tokens of each tweet
lemmasColumn = [
    [lemmatizer.lemmatize(token) for token in tweet_tokens]
    for tweet_tokens in dataset['Tokens']
]

# Replace the Tokens column with their lemmas. Assigning over the existing
# column avoids DataFrame.drop with a positional axis argument, which
# pandas 2.0 no longer accepts.
dataset['Tokens'] = pd.Series(lemmasColumn, index=dataset.index)

dataset.head()

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Unnamed: 0,tweet_id,sentiment,tweet_text,CleanText,Tokens
0,264183816548130816,positive,Gas by my house hit $3.39!!!! I\u2019m going t...,gas by my house hit 339 iu2019m going to chape...,"[gas, house, hit, 339, iu2019m, going, chapel,..."
1,263405084770172928,negative,Theo Walcott is still shit\u002c watch Rafa an...,theo walcott is still shitu002c watch rafa and...,"[theo, walcott, still, shitu, 002c, watch, raf..."
2,262163168678248449,negative,its not that I\u2019m a GSP fan\u002c i just h...,its not that iu2019m a gsp fanu002c i just hat...,"[iu2019m, gsp, fanu, 002c, hate, nick, diaz, c..."
3,264249301910310912,negative,Iranian general says Israel\u2019s Iron Dome c...,iranian general says israelu2019s iron dome ca...,"[iranian, general, say, israelu, 2019s, iron, ..."
4,262682041215234048,neutral,Tehran\u002c Mon Amour: Obama Tried to Establi...,tehranu002c mon amour obama tried to establish...,"[tehranu, 002c, mon, amour, obama, tried, esta..."


## VECTORIZING DATA

In [None]:
def get_feature_vector(train_fit):
    """Fit a TF-IDF vectorizer on ``train_fit`` and return it.

    Parameters
    ----------
    train_fit : the collection of documents passed to ``TfidfVectorizer.fit``.

    Returns
    -------
    The fitted ``TfidfVectorizer`` (created with ``sublinear_tf=True``).
    """
    # fit() returns the estimator itself, so it can be returned directly.
    return TfidfVectorizer(sublinear_tf=True).fit(train_fit)

In [None]:
# Drop the id column from the training frame before modelling.
# The call modifies `dataset` itself, so re-running this cell raises KeyError
# because 'tweet_id' is already gone.
columns_to_remove = ['tweet_id']
n_dataset = remove_unwanted_cols(dataset, columns_to_remove)
n_dataset.head()

Unnamed: 0,sentiment,tweet_text,CleanText,Tokens
0,positive,Gas by my house hit $3.39!!!! I\u2019m going t...,gas by my house hit 339 iu2019m going to chape...,"[gas, house, hit, 339, iu2019m, going, chapel,..."
1,negative,Theo Walcott is still shit\u002c watch Rafa an...,theo walcott is still shitu002c watch rafa and...,"[theo, walcott, still, shitu, 002c, watch, raf..."
2,negative,its not that I\u2019m a GSP fan\u002c i just h...,its not that iu2019m a gsp fanu002c i just hat...,"[iu2019m, gsp, fanu, 002c, hate, nick, diaz, c..."
3,negative,Iranian general says Israel\u2019s Iron Dome c...,iranian general says israelu2019s iron dome ca...,"[iranian, general, say, israelu, 2019s, iron, ..."
4,neutral,Tehran\u002c Mon Amour: Obama Tried to Establi...,tehranu002c mon amour obama tried to establish...,"[tehranu, 002c, mon, amour, obama, tried, esta..."


In [None]:
# Fit the TF-IDF vocabulary on the raw tweet text, selected by column name.
# Positional access (iloc[:, 1]) only points at 'tweet_text' after 'tweet_id'
# has been dropped; otherwise it silently picks up 'sentiment'.
# NOTE(review): the 'CleanText' / 'Tokens' columns built earlier are not used
# here, and the vectorizer is fitted on every row of `dataset` before the
# train/test split that happens later -- confirm both are intended.
tf_vector = get_feature_vector(np.array(dataset['tweet_text']))


In [None]:
# Features: TF-IDF matrix of the raw tweet text. Labels: the sentiment column.
# Columns are selected by name so the result does not depend on column order.
X = tf_vector.transform(np.array(dataset['tweet_text']))
y = np.array(dataset['sentiment'])

In [None]:
# Hold out 20% of the labelled rows for evaluation; the fixed random_state
# makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=30,
)

## NAIVE BAYES MODEL

In [None]:
# Train a multinomial Naive Bayes classifier on the training split and
# report its accuracy on the held-out split.
NB_model = MultinomialNB()
NB_model.fit(X_train, y_train)

y_predict_nb = NB_model.predict(X_test)
nb_accuracy = accuracy_score(y_test, y_predict_nb)
print(nb_accuracy)

0.5918937805730259


## LOGISTIC REGRESSION MODEL

In [None]:
# Train a logistic-regression classifier and report held-out accuracy.
# max_iter is raised above the library default because lbfgs stopped at the
# iteration limit before converging ("TOTAL NO. of ITERATIONS REACHED LIMIT").
LR_model = LogisticRegression(solver='lbfgs', max_iter=1000)
LR_model.fit(X_train, y_train)
y_predict_lr = LR_model.predict(X_test)
print(accuracy_score(y_test, y_predict_lr))

0.6457023060796646


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


## CLEANING TEST DATA

In [None]:
# Clean every test tweet in a single pass. Series.map builds the whole
# 'CleanText' column at once instead of writing one cell per row via .loc.
test_ds['CleanText'] = test_ds['tweet_text'].map(cleanTweet)
test_ds.head()

Unnamed: 0,tweet_id,tweet_text,CleanText
0,264238274963451904,"@jjuueellzz down in the Atlantic city, ventnor...",down in the atlantic city ventnor margate oce...
1,218775148495515649,Musical awareness: Great Big Beautiful Tomorro...,musical awareness great big beautiful tomorrow...
2,258965201766998017,On Radio786 100.4fm 7:10 Fri Oct 19 Labour ana...,on radio786 1004fm 710 fri oct 19 labour analy...
3,262926411352903682,"Kapan sih lo ngebuktiin,jan ngomong doang Susa...",kapan sih lo ngebuktiinjan ngomong doang susah...
4,171874368908050432,"Excuse the connectivity of this live stream, f...",excuse the connectivity of this live stream fr...


## TOKENIZING TEST DATA

In [None]:
tknzr = TweetTokenizer()

# Build the stopword collection once. Calling stopwords.words('english')
# inside the filter re-evaluates it for every single token, and a set gives
# constant-time membership tests.
english_stopwords = set(stopwords.words('english'))

# One list of stopword-free tokens per test tweet
tokens = [
    [w for w in tknzr.tokenize(clean_text) if w not in english_stopwords]
    for clean_text in test_ds['CleanText']
]
test_ds['Tokens'] = pd.Series(tokens, index=test_ds.index)

test_ds.head()

Unnamed: 0,tweet_id,tweet_text,CleanText,Tokens
0,264238274963451904,"@jjuueellzz down in the Atlantic city, ventnor...",down in the atlantic city ventnor margate oce...,"[atlantic, city, ventnor, margate, ocean, city..."
1,218775148495515649,Musical awareness: Great Big Beautiful Tomorro...,musical awareness great big beautiful tomorrow...,"[musical, awareness, great, big, beautiful, to..."
2,258965201766998017,On Radio786 100.4fm 7:10 Fri Oct 19 Labour ana...,on radio786 1004fm 710 fri oct 19 labour analy...,"[radio, 786 1004, fm, 710, fri, oct, 19, labou..."
3,262926411352903682,"Kapan sih lo ngebuktiin,jan ngomong doang Susa...",kapan sih lo ngebuktiinjan ngomong doang susah...,"[kapan, sih, lo, ngebuktiinjan, ngomong, doang..."
4,171874368908050432,"Excuse the connectivity of this live stream, f...",excuse the connectivity of this live stream fr...,"[excuse, connectivity, live, stream, baba, amr..."


## LEMMATIZATION OF TEST DATA

In [None]:
lemmatizer = WordNetLemmatizer()

# One list of lemmas per test tweet
lemmasColumn = [
    [lemmatizer.lemmatize(token) for token in tweet_tokens]
    for tweet_tokens in test_ds['Tokens']
]

# Overwrite the Tokens column with the lemmas. Assigning over the existing
# column avoids DataFrame.drop with a positional axis argument, which
# pandas 2.0 no longer accepts.
test_ds['Tokens'] = pd.Series(lemmasColumn, index=test_ds.index)

test_ds.head()

Unnamed: 0,tweet_id,tweet_text,CleanText,Tokens
0,264238274963451904,"@jjuueellzz down in the Atlantic city, ventnor...",down in the atlantic city ventnor margate oce...,"[atlantic, city, ventnor, margate, ocean, city..."
1,218775148495515649,Musical awareness: Great Big Beautiful Tomorro...,musical awareness great big beautiful tomorrow...,"[musical, awareness, great, big, beautiful, to..."
2,258965201766998017,On Radio786 100.4fm 7:10 Fri Oct 19 Labour ana...,on radio786 1004fm 710 fri oct 19 labour analy...,"[radio, 786 1004, fm, 710, fri, oct, 19, labou..."
3,262926411352903682,"Kapan sih lo ngebuktiin,jan ngomong doang Susa...",kapan sih lo ngebuktiinjan ngomong doang susah...,"[kapan, sih, lo, ngebuktiinjan, ngomong, doang..."
4,171874368908050432,"Excuse the connectivity of this live stream, f...",excuse the connectivity of this live stream fr...,"[excuse, connectivity, live, stream, baba, amr..."


## EXTRACTING THE TWEET IDS AS A SERIES

In [None]:
# Keep the test-set tweet ids as a Series so they can be attached to the
# predictions when the result table is built.
idcolumn=test_ds['tweet_id']
idcolumn.head()

0    264238274963451904
1    218775148495515649
2    258965201766998017
3    262926411352903682
4    171874368908050432
Name: tweet_id, dtype: int64

## USING LOGISTIC REGRESSION MODEL FOR FINAL RESULT

In [None]:
# Vectorise the raw test tweet text with the already-fitted TF-IDF vectorizer.
# The column is selected by name rather than position so the lookup does not
# depend on the column order of `test_ds`.
test_feature = tf_vector.transform(np.array(test_ds['tweet_text']))
test_prediction_lr = LR_model.predict(test_feature)
# Reshape the 1-D prediction array into a single column (n_rows, 1).
tp = test_prediction_lr[:, None]


In [None]:
# Result table: one 'sentiment' column from the predictions, plus the tweet
# ids (attached by index alignment).
predicted_sentiment = pd.DataFrame(tp, columns=['sentiment'])
newdf = predicted_sentiment.assign(tweet_id=idcolumn)

In [None]:
# Put the id column first, followed by the predicted sentiment.
columns_titles = ["tweet_id","sentiment"]
newdf=newdf.reindex(columns=columns_titles)

In [None]:
# Preview the first rows of the final result table.
newdf.head()

Unnamed: 0,tweet_id,sentiment
0,264238274963451904,neutral
1,218775148495515649,positive
2,258965201766998017,neutral
3,262926411352903682,positive
4,171874368908050432,neutral


In [None]:
# Write the predictions to 'result1.csv' without the DataFrame index column.
newdf.to_csv('result1.csv',index=False)