# A beginner's guide to sentiment analysis

### This notebook performs binary classification of text data using sentiment analysis. The data can be found [here](https://www.kaggle.com/c/nlp-getting-started/data).
### The idea behind this notebook is very simple: it does not use fancy or complex models. It cleans the data, encodes it as bag-of-words count vectors, and trains a logistic regression model on it.


## The aim of this notebook is to predict if a tweet is of a fake disaster or a distress signal of a real disaster.

### 0 - Not about a disaster
### 1 - About a disaster

In [None]:
import pandas as pd 

In [None]:
df = pd.read_csv('../input/nlp-getting-started/train.csv')

In [None]:
df.head()

In [None]:
df['target'].value_counts()


### The data is fairly balanced with 4342 examples of a fake disaster and 3271 examples of a real disaster

In [None]:
df.isna().sum()

In [None]:
df.fillna('',inplace=True)

# Removing punctuation

In [None]:
#library that contains punctuation
import string
string.punctuation

In [None]:

def remove_punctuation(text):
    """Return `text` with every character listed in `string.punctuation` removed."""
    kept_chars = []
    for char in text:
        if char in string.punctuation:
            continue
        kept_chars.append(char)
    return "".join(kept_chars)


In [None]:
# Strip punctuation from every tweet; the function is passed directly, no lambda wrapper needed.
df['text'] = df['text'].apply(remove_punctuation)

# Lowering the case

In [None]:
# Lower-case every tweet so that tokens differing only in case are treated as the same word.
df['text'] = df['text'].apply(str.lower)

# Tokenizing 

In [None]:

def tokenize(string):
    """Split `string` on runs of whitespace and return the resulting list of words."""
    return string.split()


In [None]:
# Turn each cleaned tweet into a list of word tokens.
df['text'] = df['text'].apply(tokenize)

In [None]:
df.head()

In [None]:
# Tokenize the keyword column the same way as the tweet text.
df['keyword'] = df['keyword'].apply(tokenize)

In [None]:
df.tail()

In [None]:
df.drop(columns=['id'],inplace=True)

In [None]:
df.head()

# Removing stop words

In [None]:
# Importing the NLP library
import nltk
# English stop words shipped with NLTK. They are stored in a set because
# they are only ever used for membership tests (`token not in stopwords`),
# which are O(1) on a set but require a linear scan on a list.
stopwords = set(nltk.corpus.stopwords.words('english'))

In [None]:
def remove_stopwords(text):
    """Return a list of the tokens in `text` that do not appear in `stopwords`."""
    return list(filter(lambda token: token not in stopwords, text))

In [None]:
# Drop stop words from every tokenized tweet.
df['text'] = df['text'].apply(remove_stopwords)

# Stemming

In [None]:
from nltk.stem.porter import PorterStemmer
porter_stemmer = PorterStemmer()

In [None]:
def stemming(text):
    """Stem each token in `text` with `porter_stemmer` and return the stems as a new list."""
    stems = []
    for token in text:
        stems.append(porter_stemmer.stem(token))
    return stems


# Replace every tweet's token list with its stemmed counterpart.
df['text'] = df['text'].apply(stemming)

# Lemmatization

In [None]:
from nltk.stem import WordNetLemmatizer
#defining the object for Lemmatization
wordnet_lemmatizer = WordNetLemmatizer()

In [None]:
#defining the function for lemmatization
def lemmatizer(text):
    """Lemmatize each token in `text` with `wordnet_lemmatizer` and return the results as a list."""
    return list(map(wordnet_lemmatizer.lemmatize, text))

In [None]:
nltk.download('wordnet')

In [None]:
# df['text']=df['text'].apply(lambda x: lemmatizer(x))

# Vectorizing the data

### Building the vocabulary
### The vocabulary built in this section is not used later in the notebook (the `CountVectorizer` below builds its own), so if you want to reproduce this code you can safely omit this section. It is kept to show how a vocabulary could be built by hand in another approach.

In [None]:
vocab = []

# Gather the tokens of every tweet into one large flat list of words.
#
# `extend` appends to the existing list in place. The earlier form
# `vocab = vocab + tokens` built a brand-new list on every iteration
# (['a', 'b'] + ['c'] -> ['a', 'b', 'c']), which is quadratic in the total
# number of tokens.
for tokens in df['text'].values:
    vocab.extend(tokens)

print(len(vocab))


In [None]:
# Passing the words through a set drops repeated occurrences of the same
# word, so only unique words remain.
set_vocab = set(vocab)

# Convert the set back to a list and report how many unique words are left.
vocab = [*set_vocab]
print(len(vocab), type(vocab))

## Vectorizing the text data

In [None]:
## Converting the tokens back to strings to feed it into Count Vectorizer

# CountVectorizer expects one string per document, so glue each token list back together with spaces.
df['text_strings'] = df['text'].apply(lambda tokens: ' '.join(map(str, tokens)))

In [None]:
df['text_strings'].head()

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
vectorizer = CountVectorizer()

In [None]:
X = vectorizer.fit_transform(df['text_strings'])

# Obtaining x_train and y_train

In [None]:
x_train = X.toarray()

## The text is now vectorized

In [None]:
import numpy as np

In [None]:
# The alias was previously misspelled as `nper`, which nothing in the notebook uses.
import numpy as np

# `X.toarray()` already returns a NumPy ndarray; `np.asarray` leaves it as-is
# instead of making a second full copy of the dense feature matrix.
x_train = np.asarray(x_train)

# This x_train can be used directly to train a model

In [None]:
y_train = df['target']

In [None]:
x_train.shape

In [None]:
y_train.shape

# Fitting a model

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
clf = LogisticRegression(random_state=42)

In [None]:
clf.fit(x_train,y_train)

In [None]:
pred = clf.predict(x_train)

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
# NOTE: `pred` was produced from `x_train`, the same data the model was fitted
# on, so this is a training-set accuracy, not a held-out estimate.
accuracy_score(y_true=y_train, y_pred=pred)

# Testing the model on test set

## Preprocessing the test set

In [None]:
df_test =pd.read_csv('../input/nlp-getting-started/test.csv')

In [None]:
df_test.fillna('',inplace=True)

In [None]:
df_test.drop(columns=['id','keyword','location'],inplace=True)

In [None]:
# Apply the same cleaning steps, in the same order, that were applied to the
# training text: punctuation removal -> lower-casing -> tokenizing ->
# stop-word removal -> stemming.
# The lower-casing step was previously missing here. Without it, capitalised
# stop words ("The", "And", ...) are not removed from the test tweets, so the
# test features are built differently from the training features.
df_test['text'] = (
    df_test['text']
    .apply(remove_punctuation)
    .apply(str.lower)
    .apply(tokenize)
    .apply(remove_stopwords)
    .apply(stemming)
)

In [None]:
# Re-join each test tweet's tokens into a single space-separated string for the vectorizer.
df_test['text_strings'] = df_test['text'].apply(lambda tokens: ' '.join(map(str, tokens)))

In [None]:
x_test = vectorizer.transform(df_test['text_strings'])

In [None]:
x_test = x_test.toarray()

In [None]:
x_test = np.array(x_test)

In [None]:
y_test_pred = clf.predict(x_test)

In [None]:
y_test_pred

# Putting the predictions to test csv

In [None]:
import pandas as pd

In [None]:
submission = pd.read_csv('../input/nlp-getting-started/test.csv')

In [None]:
submission['target'] = y_test_pred

In [None]:
submission.head()

In [None]:
final_submission = submission[['id','target']]

In [None]:
# index=False keeps the DataFrame's row index out of the file, so the CSV
# contains only the `id` and `target` columns selected above.
final_submission.to_csv('final_submission.csv', index=False)

In [None]:
# Run the sample sentence through the same cleaning pipeline as the training
# tweets before vectorizing it. The vectorizer was fitted on stemmed,
# stop-word-free text, so a raw word such as "raining" would not match the
# stems it learned.
sample_text = remove_punctuation('it is raining fire').lower()
sample_tokens = stemming(remove_stopwords(tokenize(sample_text)))
one_string = vectorizer.transform([' '.join(sample_tokens)])

In [None]:
clf.predict(one_string)

### This means it is a tweet about a disaster

# Final Comments

### Having created this notebook from scratch while starting afresh in the field of NLP, it would be invaluable to have you comment on this notebook to tell me what I could have done better, what I did right, and what must never be done. Do comment ;)