# Project on Sentiment Analysis 


Sentiment analysis remains one of the key problems that has seen extensive application of natural language processing. This time around, given tweets from customers about various tech firms that manufacture and sell mobiles, computers, laptops, etc., the task is to identify whether the tweets express a negative sentiment towards such companies or products.

#### Import necessary libraries

In [None]:
import numpy as np
import pandas as pd

#### importing training and test data

In [None]:
# Locations of the training and test CSV files on the local machine.
train_csv_path = "D:/dataforpython/analytics_vidya_sentiment_analysis/train_2kmZucJ.csv"
test_csv_path = "D:/dataforpython/analytics_vidya_sentiment_analysis/test_oJQbWVk.csv"

# Read both files into DataFrames.
data = pd.read_csv(train_csv_path)
test_data = pd.read_csv(test_csv_path)

#### Basic EDA

In [None]:
# Quick look at the training data: the first five rows, then (rows, columns).
data.head(n=5)
data.shape

# Count the missing values in each column.
data.isnull().sum()

#### Checking if the data is balanced

In [None]:
# Class distribution of the target column before balancing.
data["label"].value_counts()

# The data is imbalanced, so the minority class (label 1) is upsampled with
# replacement until it has as many rows as the majority class (label 0).
from sklearn.utils import resample

df_majority = data[data["label"] == 0]
df_minority = data[data["label"] == 1]

# The target size is derived from the majority class instead of a hard-coded
# row count, so the classes stay balanced if the training file changes.
upSample = resample(
    df_minority,
    replace=True,
    n_samples=len(df_majority),
    random_state=0,  # fixed seed keeps the resampled rows reproducible
)

# Majority rows followed by the upsampled minority rows; the original index
# labels are kept here, so they are no longer unique.
data = pd.concat([df_majority, upSample])
data["label"].value_counts()

#### Cleaning the text data from data["tweet"] column

In [None]:
# importing libraries for cleaning
import nltk
import re
import string
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.stem import SnowballStemmer
# Imported under an alias so that the `stopwords` set built below does not
# rebind the corpus reader's own name; with a shared name, any later
# `stopwords.words(...)` call would be made on a set and fail.
from nltk.corpus import stopwords as nltk_stopwords

# creating objects of PorterStemmer and WordNetLemmatizer
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

# English stop words as a set; the cleaning loops test membership against it.
stopwords = set(nltk_stopwords.words('english'))


# checking length of stop words
len(stopwords)

# Adding punctuation characters to the set of stop words
punctuation = list(string.punctuation)
stopwords.update(punctuation)

# checking length after adding punctuation to stop words
len(stopwords)

# Resetting the index: after upsampling, the index labels are no longer a
# clean 0..n-1 range.
data = data.reset_index(drop=True)

#### Cleaning text

In [None]:
# Clean every training tweet: lower-case it, replace non-letters with spaces,
# tokenize, drop stop words and tokens of two characters or fewer, stem the
# remaining tokens, and join them back into one string.
#
# The tweets are iterated by value rather than looked up by label with
# data["tweet"][i], so the loop does not depend on the index being 0..n-1.
# The per-row debug prints were removed; they wrote two lines for every tweet.
doc = []
for raw_tweet in data["tweet"]:
    text = str(raw_tweet).lower()
    text = re.sub("[^a-zA-Z]", " ", text)
    tokens = nltk.word_tokenize(text)
    stems = [
        stemmer.stem(word)
        for word in tokens
        if word not in stopwords and len(word) > 2
    ]
    doc.append(" ".join(stems))

# checking shape
print(data.shape[0])

# Store the cleaned text back in the frame. A plain list is assigned by
# position, so no index alignment is involved.
data["tweet"] = doc

#### Creating Independent & Target variable

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF over unigrams and bigrams, limited to 15000 features.
cv = TfidfVectorizer(max_features=15000, ngram_range=(1, 2))
tfidf_matrix = cv.fit_transform(doc)
df = tfidf_matrix.toarray()
df.shape

# Wrap the dense array in a DataFrame so it can be concatenated (optional)
x_df = pd.DataFrame(df)

# Independent variables: whatever columns remain in `data` after dropping
# the text, id and label, followed by the TF-IDF columns.
other_columns = data.drop(columns=["tweet", "id", "label"])
x = pd.concat([other_columns, x_df], axis=1)
x.shape

# Target variable
y = data["label"]

#### Training model with Multinomial NB

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Fit a multinomial Naive Bayes classifier on the training features and labels.
classifier = MultinomialNB().fit(x, y)

# Preprocessing is done on a copy of the test data so the original frame
# is not affected.
test = test_data.copy(deep=True)

#### Cleaning text for test data

In [None]:
# Clean every test tweet with the same steps used for the training tweets:
# lower-case, replace non-letters with spaces, tokenize, drop stop words and
# tokens of two characters or fewer, stem, and join back into one string.
#
# The tweets are iterated by value rather than looked up by label with
# test["tweet"][i], so the loop does not depend on the index being 0..n-1.
# The per-row debug prints were removed; they wrote two lines for every tweet.
doc1 = []
for raw_tweet in test["tweet"]:
    text = str(raw_tweet).lower()
    text = re.sub("[^a-zA-Z]", " ", text)
    tokens = nltk.word_tokenize(text)
    stems = [
        stemmer.stem(word)
        for word in tokens
        if word not in stopwords and len(word) > 2
    ]
    doc1.append(" ".join(stems))

# checking shape
print(test.shape[0])

# Store the cleaned text back in the frame. A plain list is assigned by
# position, so no index alignment is involved.
test["tweet"] = doc1

# Build the independent variables for the test set. The vectorizer is only
# applied here (transform), not fitted again.
test_tfidf = cv.transform(doc1)
df1 = test_tfidf.toarray()
df1.shape

# Wrap the dense array in a DataFrame so it can be concatenated (optional)
x_df1 = pd.DataFrame(df1)

# Independent variables: whatever columns remain in `test` after dropping
# the text and id, followed by the TF-IDF columns.
other_test_columns = test.drop(columns=["tweet", "id"])
x_test = pd.concat([other_test_columns, x_df1], axis=1)
x_test.shape

#### predicting test set results

In [None]:
# Predict a label for every row of the test feature matrix.
y_pred = classifier.predict(x_test)

# Index the predictions by the ids read from the test file instead of a
# hard-coded np.arange(7921, 9874), which only fits one exact test-set size.
# NOTE(review): assumes the test `id` column holds the same ids that were
# hard-coded before (7921..9873) - confirm against the test CSV.
# `.to_numpy()` drops the column name, so the CSV header stays unchanged.
prediction = pd.DataFrame(y_pred, index=test_data["id"].to_numpy())
prediction.to_csv("D:/dataforpython/analytics_vidya_sentiment_analysis/submit1.csv")