<a href="https://colab.research.google.com/github/sahithyagunda/myprojects/blob/main/Sentiment_analysis_NLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.
# Download the dataset through kagglehub and keep the local path it returns.
# NOTE(review): the dataset-loading cell further down reads from a hard-coded
# '/kaggle/input/...' path instead of this variable — confirm both locations match.
import kagglehub
jp797498e_twitter_entity_sentiment_analysis_path = kagglehub.dataset_download('jp797498e/twitter-entity-sentiment-analysis')

print('Data source import complete.')


In [None]:
import os

# Print the full path of every file found below the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(root, name) for name in names):
        print(full_path)

# Importing Dependencies

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
import string
import warnings
from nltk.stem.porter import PorterStemmer
from wordcloud import WordCloud
from collections import Counter
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score,precision_score,confusion_matrix
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import GaussianNB,MultinomialNB,BernoulliNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression


### creating Instances

# One shared instance per helper/model used further down in the notebook.
# Estimators with internal randomness get a fixed seed so reruns are repeatable.
SEED = 24  # same value the train/test split passes as random_state

# Text pre-processing and encoding helpers
ps = PorterStemmer()
encoder = LabelEncoder()
cv = CountVectorizer()

# Classifiers: every name referenced by the model-comparison cells is
# created exactly once here.
svc = SVC()
knc = KNeighborsClassifier()
mnb = MultinomialNB()
bnb = BernoulliNB()
gnb = GaussianNB()
dtc = DecisionTreeClassifier(random_state=SEED)
lrc = LogisticRegression()
rfc = RandomForestClassifier(random_state=SEED)
abc = AdaBoostClassifier(random_state=SEED)
bc = BaggingClassifier(random_state=SEED)
etc = ExtraTreesClassifier(random_state=SEED)
gbdt = GradientBoostingClassifier(random_state=SEED)
xgb = XGBClassifier(random_state=SEED)

# Silence library warnings for the rest of the session.
warnings.filterwarnings('ignore')

# Loading Datasets

In [None]:
# These CSV files have no header row (column names are assigned manually right
# after loading), so read them with header=None. With the default header=0,
# pandas would use the first record as column labels and that record would be
# lost once the labels are overwritten.
train_df = pd.read_csv('/kaggle/input/twitter-entity-sentiment-analysis/twitter_training.csv', header=None)
val_df = pd.read_csv('/kaggle/input/twitter-entity-sentiment-analysis/twitter_validation.csv', header=None)

**Adding column names**

In [None]:
# Give both frames the same four column labels.
column_names = ['id','Entity','Sentiment','Tweet']
train_df.columns = list(column_names)
val_df.columns = list(column_names)

In [None]:
# Distinct values found in the Entity column of the training frame.
train_df['Entity'].unique()

In [None]:
# Print the distinct sentiment labels of each frame so they can be compared.
print(train_df['Sentiment'].unique())
print(val_df['Sentiment'].unique())

# Concatenating training and validation datasets

In [None]:
# Stack both frames row-wise; ignore_index=True renumbers the rows from 0.
# NOTE(review): the combined frame is split randomly again later, so the
# original training/validation boundary is not preserved — confirm this is intended.
data = pd.concat([train_df,val_df],ignore_index = True)

In [None]:
# (rows, columns) of the combined frame.
data.shape

In [None]:
data = data.dropna() ## dropping rows that contain null values

In [None]:
# Null count per column, checked right after dropna().
data.isnull().sum()

In [None]:
data.duplicated().sum() ### number of rows that repeat an earlier row

In [None]:
data = data.drop_duplicates(keep = 'first') ## keep only the first copy of each duplicated row

In [None]:
# Duplicate count again, after de-duplication.
data.duplicated().sum()

In [None]:
data.info() ## column dtypes and non-null counts

**The `id` column is only an identifier for each record; since the DataFrame index already serves that purpose, it is not needed for this analysis. `Entity` holds the keyword each tweet refers to and is unlikely to be an important feature here.**

**Dropping ID and Entity**

In [None]:
# Remove the id and Entity columns from the working frame.
data = data.drop(columns = ['id','Entity'])

In [None]:
# Number of rows per sentiment label.
target_counts = data['Sentiment'].value_counts()

In [None]:
# Bar chart with the number of tweets per sentiment label.
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(x='Sentiment', data=data, ax=ax)
ax.set_title('Distribution of Sentiments')
ax.set_xlabel('Sentiments')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
from nltk.corpus import stopwords

# Fetch the NLTK resources this notebook relies on. On a fresh environment
# stopwords.words() and nltk.word_tokenize() raise LookupError when the
# corresponding data packages are missing ('punkt_tab' is the tokenizer
# package name used by recent NLTK releases, 'punkt' by older ones).
for resource in ('stopwords', 'punkt', 'punkt_tab'):
    nltk.download(resource, quiet=True)

stop_wrds = stopwords.words('english')   # English stop-word list
punctuations = string.punctuation        # ASCII punctuation characters

In [None]:
# Preview the first three rows.
data.head(3)

# Defining a Function that Transforms Text

In [None]:
def transform_text(text):
    """Normalise one tweet into a space-separated string of stems.

    Steps, applied in this order:
      1. lower-case the text and split it with ``nltk.word_tokenize``;
      2. keep only purely alphanumeric tokens;
      3. drop tokens found in ``stop_wrds`` or ``punctuations``;
      4. stem each surviving token with the Porter stemmer ``ps``.

    Returns the stems joined by single spaces (an empty string when no
    token survives).
    """
    tokens = nltk.word_tokenize(text.lower())
    kept = (
        token
        for token in tokens
        if token.isalnum()
        and token not in stop_wrds
        and token not in punctuations
    )
    return " ".join(ps.stem(token) for token in kept)

In [None]:
# Run transform_text on every tweet and store the result in a new column.
data['cleaned_tweet'] = data['Tweet'].apply(transform_text)

# Function to remove emojis

In [None]:
import re
# Compiled once at definition time instead of on every call.
# Each range is limited to emoji/symbol blocks; a single wide range such as
# U+24C2..U+1F251 would also match CJK, Hangul and other ordinary text.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F900-\U0001F9FF"  # supplemental symbols & pictographs
    "\U0001FA70-\U0001FAFF"  # symbols & pictographs extended-A
    "\U0001F1E0-\U0001F1FF"  # regional indicators (flags)
    "\U0001F200-\U0001F251"  # enclosed ideographic supplement
    "\U00002600-\U000026FF"  # miscellaneous symbols
    "\U00002702-\U000027B0"  # dingbats
    "\U00002B00-\U00002BFF"  # miscellaneous symbols and arrows
    "\U000024C2"             # circled latin capital letter M
    "\U0000FE0F"             # variation selector-16 (emoji presentation)
    "]+",
    flags=re.UNICODE,
)


def remove_emojis(text):
    """Return ``text`` with emoji characters removed.

    Non-string values (for example ``None`` or ``NaN``) are returned
    unchanged so the function can be applied to a column with missing
    entries. Letters of any script are left untouched.
    """
    if not isinstance(text, str):
        return text
    return _EMOJI_PATTERN.sub('', text)

In [None]:
# Apply remove_emojis to the already-cleaned text, overwriting the column.
data['cleaned_tweet']=data['cleaned_tweet'].apply(remove_emojis)

In [None]:
data = data.drop(columns = ['Tweet'])   ### drop the raw tweet text; cleaned_tweet stays

# Label Encoding

In [None]:
# Replace the sentiment strings with the integer codes produced by LabelEncoder.
data['Sentiment'] = encoder.fit_transform(data['Sentiment'])

In [None]:
data['Sentiment'].value_counts() ### LabelEncoder numbers the labels in sorted order:
                                 ### 0 - Irrelevant
                                 ### 1 - Negative
                                 ### 2 - Neutral
                                 ### 3 - Positive

# Visualizing the most common words using the WordCloud

In [None]:
# Word-cloud generator: 500x500 canvas with a white background.
wc = WordCloud(width = 500,height = 500,background_color = 'white')

In [None]:
# Word cloud built from every cleaned tweet with sentiment code 1.
negative_text = data.loc[data['Sentiment'] == 1, 'cleaned_tweet'].str.cat(sep=" ")
neg_wc = wc.generate(negative_text)
plt.imshow(neg_wc)

In [None]:
# Word cloud built from every cleaned tweet with sentiment code 2.
neutral_text = data.loc[data['Sentiment'] == 2, 'cleaned_tweet'].str.cat(sep=" ")
neu_wc = wc.generate(neutral_text)
plt.imshow(neu_wc)

In [None]:
# Word cloud built from every cleaned tweet with sentiment code 3.
positive_text = data.loc[data['Sentiment'] == 3, 'cleaned_tweet'].str.cat(sep=" ")
pos_wc = wc.generate(positive_text)
plt.imshow(pos_wc)

In [None]:
# Word cloud built from every cleaned tweet with sentiment code 0.
irrelevant_text = data.loc[data['Sentiment'] == 0, 'cleaned_tweet'].str.cat(sep=" ")
irr_wc = wc.generate(irrelevant_text)
plt.imshow(irr_wc)

# Creating a corpus for each sentiment

In [None]:
# Flatten the cleaned tweets of each sentiment code into one flat token list.
# The first three sizes are printed; the last one is shown as the cell result.
neg_corpus = [
    token
    for tweet in data.loc[data['Sentiment'] == 1, 'cleaned_tweet']
    for token in tweet.split()
]
print(len(neg_corpus))

neu_corpus = [
    token
    for tweet in data.loc[data['Sentiment'] == 2, 'cleaned_tweet']
    for token in tweet.split()
]
print(len(neu_corpus))

pos_corpus = [
    token
    for tweet in data.loc[data['Sentiment'] == 3, 'cleaned_tweet']
    for token in tweet.split()
]
print(len(pos_corpus))

irr_corpus = [
    token
    for tweet in data.loc[data['Sentiment'] == 0, 'cleaned_tweet']
    for token in tweet.split()
]
len(irr_corpus)


# Visualizing the most commonly occurring words

In [None]:
# Thirty most frequent tokens of the negative corpus, drawn as a bar chart.
negative_word_count = Counter(neg_corpus).most_common(30)
negative_word_table = pd.DataFrame(negative_word_count, columns=['word', 'count'])
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x='word', y='count', data=negative_word_table, ax=ax)
ax.set_title("Counter of Negative corpus plot")
ax.set_xlabel('words')
ax.set_ylabel('count')
ax.tick_params(axis='x', labelrotation=90)  # vertical tick labels
plt.show()

In [None]:
# Thirty most frequent tokens of the neutral corpus, drawn as a bar chart.
neutral_word_count = Counter(neu_corpus).most_common(30)
neutral_word_table = pd.DataFrame(neutral_word_count, columns=['word', 'count'])
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x='word', y='count', data=neutral_word_table, ax=ax)
ax.set_title("counter of neutral word")
ax.set_xlabel('words')
ax.set_ylabel('count')
ax.tick_params(axis='x', labelrotation=90)  # vertical tick labels
plt.show()

In [None]:
# Thirty most frequent tokens of the positive corpus, drawn as a bar chart.
positive_word_count = Counter(pos_corpus).most_common(30)
postive_word_table = pd.DataFrame(positive_word_count, columns=['word', 'count'])
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x='word', y='count', data=postive_word_table, ax=ax)
ax.set_title('counter of positive words')
ax.set_xlabel('words')
ax.set_ylabel('count')
ax.tick_params(axis='x', labelrotation=90)  # vertical tick labels
plt.show()

In [None]:
# Thirty most frequent tokens of the irrelevant corpus, drawn as a bar chart.
irrelevant_word_count = Counter(irr_corpus).most_common(30)
irrelevant_word_table = pd.DataFrame(irrelevant_word_count,columns = ['word','count'])
plt.figure(figsize = (10,6))
sns.barplot(x = 'word',y = 'count',data = irrelevant_word_table )
# Title added so this chart is labelled like the other three sentiment charts.
plt.title('counter of irrelevant words')
plt.xlabel('words')
plt.ylabel('count')
plt.xticks(rotation = 'vertical')
plt.show()

**Transforming the tweets with CountVectorizer and converting the result to an array**

In [None]:
# Bag-of-words features: learn the vocabulary from the cleaned tweets and
# convert the resulting count matrix to a dense NumPy array.
# NOTE(review): .toarray() materialises every cell of the matrix in memory;
# with many tweets and a large vocabulary this can be very big — confirm it fits.
X = cv.fit_transform(data['cleaned_tweet']).toarray()

In [None]:
# (number of tweets, number of vocabulary terms)
X.shape

In [None]:
# Target vector: the encoded sentiment of every row as a NumPy array.
y = data['Sentiment'].values

In [None]:
# Display the target array.
y

**Splitting the data into train and test datasets**

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the rows for testing; random_state fixes the shuffle so the
# same split is produced on every run.
X_train,X_test,y_train,y_test = train_test_split(X,y,random_state = 24,test_size = 0.3)

In [None]:
# Print the shape of each split: train X, train y, test X, test y.
for split in (X_train, y_train, X_test, y_test):
    print(split.shape)

In [None]:
# Fit multinomial Naive Bayes on the training split, then print its accuracy
# and confusion matrix on the held-out split.
mnb.fit(X_train,y_train)
y_pred = mnb.predict(X_test)
print(accuracy_score(y_test,y_pred))
print(confusion_matrix(y_test,y_pred))

In [None]:
# Fit the random forest on the training split, then print its accuracy and
# confusion matrix on the held-out split.
rfc.fit(X_train,y_train)
y_pred = rfc.predict(X_test)   # predictions must come from the model fitted above
print(accuracy_score(y_test,y_pred))
print(confusion_matrix(y_test,y_pred))

# **Try to run these cells**

In [None]:
# Maps a short display name to the classifier object to be evaluated.
# NOTE(review): every value is a bare variable name that must already exist in
# the kernel when this cell runs — confirm each one is instantiated beforehand.
clfs = {'SVC' : svc,
    'KN' : knc,
    'MNB': mnb,
     'BNB':bnb,
     'GNB':gnb,
    'DT': dtc,
    'LR': lrc,
    'RF': rfc,
    'AdaBoost': abc,
    'BgC': bc,
    'ETC': etc,
    'GBDT':gbdt,
    'xgb':xgb}

In [None]:
def train_classifier(clf,X_train,X_test,y_train,y_test,average=None):
    """Fit ``clf`` on the training split and score it on the test split.

    Parameters
    ----------
    clf : estimator exposing ``fit(X, y)`` and ``predict(X)``.
    X_train, y_train : training features and labels.
    X_test, y_test : held-out features and labels.
    average : averaging mode forwarded to ``precision_score``. When left as
        ``None`` it is chosen automatically: ``'binary'`` for at most two
        classes and ``'macro'`` otherwise, because ``precision_score`` raises
        ``ValueError`` for multiclass targets with ``average='binary'``.

    Returns
    -------
    tuple
        ``(accuracy, precision)`` computed on the test split.
    """
    clf.fit(X_train,y_train)
    y_pred = clf.predict(X_test)
    accuracy = accuracy_score(y_test,y_pred)
    if average is None:
        # Count the labels seen in either the truth or the predictions.
        n_labels = np.union1d(y_test, y_pred).size
        average = 'binary' if n_labels <= 2 else 'macro'
    # zero_division=0 scores a class that was never predicted as 0 instead of
    # emitting an undefined-metric warning.
    precision = precision_score(y_test,y_pred,average=average,zero_division=0)
    return accuracy,precision

In [None]:
# Fit and score the `svc` estimator; the cell displays the returned
# (accuracy, precision) pair.
train_classifier(svc,X_train,X_test,y_train,y_test)