In [1]:
import numpy as np
import pandas as pd
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score
import nltk
from nltk.corpus import stopwords

In [3]:
# Column labels applied to both CSV files. Because `names=` is given without a
# `header=` argument, pandas reads the first line of each file as data.
col_name = ["no", "name", "result", "review"]
df1 = pd.read_csv("twitter_validation.csv", names=col_name)
df2 = pd.read_csv("twitter_training.csv", names=col_name)
# ignore_index=True gives the combined frame a fresh 0..n-1 index. Without it
# each input keeps its own index, so labels repeat in the result and
# label-based lookups such as df.loc[i] can match more than one row.
# NOTE(review): the validation file is merged with the training file and the
# combined frame is split at random further down, so no untouched hold-out
# set remains — confirm this is intended.
df = pd.concat([df1, df2], ignore_index=True)

In [4]:
# Number of missing values in each column.
df.isna().sum()

no          0
name        0
result      0
review    686
dtype: int64

In [5]:
# Discard every row that has a missing value in any column.
df = df.dropna(how="any")

In [6]:
# Re-check the per-column missing-value counts after dropping incomplete rows.
df.isna().sum()

no        0
name      0
result    0
review    0
dtype: int64

In [7]:
# Drop the "no" column, then keep only rows whose label is neither
# 'Neutral' nor 'Irrelevant'.
df = (
    df.drop(columns="no")
      .loc[lambda frame: ~frame["result"].isin(["Neutral", "Irrelevant"])]
)

print(df.head())

                   name    result  \
2             Microsoft  Negative   
3                 CS-GO  Negative   
5                  FIFA  Negative   
6             MaddenNFL  Positive   
7  TomClancysRainbowSix  Positive   

                                              review  
2  @Microsoft Why do I pay for WORD when it funct...  
3  CSGO matchmaking is so full of closet hacking,...  
5  Hi @EAHelp I’ve had Madeleine McCann in my cel...  
6  Thank you @EAMaddenNFL!! \n\nNew TE Austin Hoo...  
7  Rocket League, Sea of Thieves or Rainbow Six: ...  


In [8]:
# Encode the sentiment labels as integers.
# Series.map turns any value that is not a key of `mapping` into NaN, so
# running this cell a second time (when the column already holds 0/1) would
# blank out the labels.
mapping = dict(Positive=1, Negative=0)
df["result"] = df["result"].map(mapping)

In [9]:
def clean_html(text):
    """Strip HTML-style tags from ``text``.

    Every non-greedy ``<...>`` span is deleted; all other characters are
    returned unchanged.
    """
    tag_pattern = re.compile("<.*?>")
    return tag_pattern.sub("", text)

# Remove tag-like spans from every review.
df["review"] = df["review"].map(clean_html)

In [10]:
def convert_lower(text):
    """Return a lower-cased copy of ``text``."""
    lowered = text.lower()
    return lowered

# Lower-case every review.
df["review"] = df["review"].map(convert_lower)

In [11]:
def remove_special(text):
    """Replace every character that is not an ASCII letter or digit with a space.

    Existing whitespace is also replaced by a space character, so the length
    of the string is unchanged.
    """
    non_alphanumeric = re.compile(r"[^a-zA-Z0-9]")
    return non_alphanumeric.sub(" ", text)

# Blank out punctuation and other non-alphanumeric characters in every review.
df["review"] = df["review"].map(remove_special)

In [12]:
nltk.download('stopwords')

# Load the English stop-word list once and keep it as a frozenset: the corpus
# reader is no longer invoked for every review, and membership tests become
# constant-time lookups instead of a linear scan over a list.
_ENGLISH_STOPWORDS = frozenset(stopwords.words("english"))


def remove_stopwords(text):
    """Return ``text`` with English stop words removed.

    The text is split on whitespace, tokens found in NLTK's English stop-word
    list are discarded, and the remaining tokens are joined with single
    spaces. Matching is exact and case-sensitive.
    """
    kept_tokens = [token for token in text.split() if token not in _ENGLISH_STOPWORDS]
    return " ".join(kept_tokens)

# Drop English stop words from every review.
df["review"] = df["review"].map(remove_stopwords)

[nltk_data] Downloading package stopwords to C:\Users\priyam
[nltk_data]     jain\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


In [13]:
from nltk.stem import PorterStemmer

# One stemmer instance shared by every call to stem_words.
porter = PorterStemmer()


def stem_words(text):
    """Return ``text`` with each whitespace-separated token replaced by its Porter stem.

    The stemmed tokens are joined back together with single spaces.
    """
    return " ".join(map(porter.stem, text.split()))

# Stem every token of every review.
df["review"] = df["review"].map(stem_words)

In [14]:
# Prefix each cleaned review with the value of its "name" column, separated by
# a hyphen, so that value also becomes part of the text fed to the vectorizer.
df = df.assign(new_review=lambda frame: frame["name"] + "-" + frame["review"])

In [15]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df["new_review"],
    df["result"],
    test_size=0.2,
    random_state=42,
)


In [16]:
cv = CountVectorizer()

# Keep the bag-of-words matrices sparse. Calling .toarray() allocates a dense
# (n_documents x vocabulary_size) array that is almost entirely zeros, which
# can exhaust memory on a large corpus; scikit-learn's RandomForestClassifier
# accepts sparse matrices for both fit and predict.
# The vocabulary is learned from the training split only and then reused to
# encode the test split.
X_train = cv.fit_transform(X_train)
X_test = cv.transform(X_test)

In [17]:
from sklearn.ensemble import RandomForestClassifier

In [18]:
from sklearn.ensemble import RandomForestClassifier

# Fix the seed so the fitted forest, and therefore the reported accuracy, is
# reproducible across runs; 42 matches the seed used for the train/test split.
rf_classifier = RandomForestClassifier(random_state=42)

rf_classifier.fit(X_train, y_train)

# Predicted labels for the held-out test split.
y_pred = rf_classifier.predict(X_test)

In [20]:
# Display the predicted labels for the test split.
y_pred

array([1, 1, 1, ..., 0, 1, 0], dtype=int64)

In [19]:
# Fraction of test rows whose predicted label equals the true label.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.9676308539944903
