In [52]:
import warnings
import pandas as pd
import numpy as np
from textblob import TextBlob
import contractions
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from nltk.corpus import wordnet
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
warnings.filterwarnings("ignore")
#nltk.download('punkt')
#nltk.download('stopwords')
#nltk.download('wordnet')
#nltk.download('averaged_perceptron_tagger')

In [53]:
df = pd.read_csv('Train.csv')

In [54]:
df

Unnamed: 0,id,Sentence,Aspect Term,polarity,from,to
0,2339,I charge it at night and skip taking the cord ...,cord,neutral,41,45
1,2339,I charge it at night and skip taking the cord ...,battery life,positive,74,86
2,1316,The tech guy then said the service center does...,service center,negative,27,41
3,1316,The tech guy then said the service center does...,"""sales"" team",negative,109,121
4,1316,The tech guy then said the service center does...,tech guy,neutral,4,12
...,...,...,...,...,...,...
2353,2272,We also use Paralles so we can run virtual mac...,Windows Server Enterprise 2003,neutral,104,134
2354,2272,We also use Paralles so we can run virtual mac...,Windows Server 2008 Enterprise,neutral,140,170
2355,848,"How Toshiba handles the repair seems to vary, ...",repair,conflict,24,30
2356,848,"How Toshiba handles the repair seems to vary, ...",repair,positive,130,136


In [55]:
df = df.drop(columns=['id'])

In [56]:
df = df.drop(columns=['from', 'to'])

In [57]:
df.head(5)

Unnamed: 0,Sentence,Aspect Term,polarity
0,I charge it at night and skip taking the cord ...,cord,neutral
1,I charge it at night and skip taking the cord ...,battery life,positive
2,The tech guy then said the service center does...,service center,negative
3,The tech guy then said the service center does...,"""sales"" team",negative
4,The tech guy then said the service center does...,tech guy,neutral


In [58]:
df['polarity'].nunique()

4

In [59]:
df['polarity'].value_counts()

polarity
positive    987
negative    866
neutral     460
conflict     45
Name: count, dtype: int64

In [60]:
df['Sentence'] = df['Sentence'].str.lower()
df['Aspect Term'] = df['Aspect Term'].str.lower()

In [61]:
url_pattern = r'http[s]?://[^\s]+'
urls_found = df[df['Sentence'].str.contains(url_pattern, regex=True)]

print("Sentences with URLs:")
print(urls_found[['Sentence']])

Sentences with URLs:
Empty DataFrame
Columns: [Sentence]
Index: []


In [62]:
email_pattern = r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'
emails_found = df[df['Sentence'].str.contains(email_pattern, regex=True)]

print("\nSentences with email addresses:")
print(emails_found[['Sentence']])


Sentences with email addresses:
Empty DataFrame
Columns: [Sentence]
Index: []


In [63]:
hashtag_pattern = r'#[\w]+'
hashtags_found = df[df['Sentence'].str.contains(hashtag_pattern, regex=True)]

print("\nSentences with hashtags:")
print(hashtags_found[['Sentence']])


Sentences with hashtags:
Empty DataFrame
Columns: [Sentence]
Index: []


In [64]:
df['Sentence'] = df['Sentence'].str.strip().str.replace('\s+', ' ', regex=True)
df['Aspect Term'] = df['Aspect Term'].str.strip().str.replace('\s+', ' ', regex=True)

In [65]:
df['Sentence'] = df['Sentence'].apply(contractions.fix)
df['Aspect Term'] = df['Aspect Term'].apply(contractions.fix)

In [66]:
def remove_special_characters(text):
    return re.sub(r'[^A-Za-z0-9\s]', '', text)

df['Sentence'] = df['Sentence'].apply(remove_special_characters)
df['Aspect Term'] = df['Aspect Term'].apply(remove_special_characters)

In [67]:
def remove_numbers(text):
    return re.sub(r'\d+', '', text)

df['Sentence'] = df['Sentence'].apply(remove_numbers)
df['Aspect Term'] = df['Aspect Term'].apply(remove_numbers)

In [68]:
df['Sentence'] = df['Sentence'].str.strip().str.replace('\s+', ' ', regex=True)
df['Aspect Term'] = df['Aspect Term'].str.strip().str.replace('\s+', ' ', regex=True)

In [69]:
def get_wordnet_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

In [70]:
def preprocess(text):
    tokens = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [word for word in tokens if word not in stop_words]
    pos_tags = pos_tag(filtered_tokens)
    #ps = PorterStemmer()
    #stemmed_tokens = [ps.stem(word) for word in filtered_tokens]
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(word, get_wordnet_pos(tag)) for word, tag in pos_tags]
    return ' '.join(lemmatized_tokens)

In [71]:
df['Sentence'] = df['Sentence'].apply(preprocess)
df['Aspect Term'] = df['Aspect Term'].apply(preprocess)

In [72]:
df['Sentence']

0           charge night skip take cord good battery life
1           charge night skip take cord good battery life
2       tech guy say service center exchange direct co...
3       tech guy say service center exchange direct co...
4       tech guy say service center exchange direct co...
                              ...                        
2353    also use paralles run virtual machine window x...
2354    also use paralles run virtual machine window x...
2355    toshiba handle repair seem vary folk indicate ...
2356    toshiba handle repair seem vary folk indicate ...
2357    would like use different operate system altoge...
Name: Sentence, Length: 2358, dtype: object

In [73]:
df['Combined'] = df['Sentence'] + " " + df['Aspect Term']

In [74]:
X = df['Combined']
y = df['polarity']

In [24]:
vectorizer = CountVectorizer()
X_bow = vectorizer.fit_transform(X)

In [25]:
le = LabelEncoder()
y = le.fit_transform(y)

In [26]:
chi2_selector_bow = SelectKBest(chi2, k=100)
X_kbest_bow = chi2_selector_bow.fit_transform(X_bow, y)

In [27]:
X_train, X_test, y_train, y_test = train_test_split(X_kbest_bow, y, test_size=0.2, random_state=42)

In [28]:
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=5805)
rf_classifier.fit(X_train, y_train)

y_pred = rf_classifier.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f'Random Forest Accuracy: {accuracy}')

Random Forest Accuracy: 0.6186440677966102


In [29]:
svc_classifier = SVC(kernel='rbf', random_state=5805)
svc_classifier.fit(X_train, y_train)

y_pred = svc_classifier.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f'SVC Accuracy: {accuracy}')

SVC Accuracy: 0.625


In [30]:
decison_tree_classifier = DecisionTreeClassifier(criterion='entropy', random_state=5805)
decison_tree_classifier.fit(X_train, y_train)

y_pred = decison_tree_classifier.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f'Decison Tree Classifier Accuracy: {accuracy}')

Decison Tree Classifier Accuracy: 0.6186440677966102


In [31]:
def evaluate_classifier(X, y, classifier):
    scores = cross_val_score(classifier, X, y, cv=10, scoring='accuracy')
    return scores.mean()

classifiers = {
    "Random Forest": RandomForestClassifier(random_state=5805),
    "SVM": SVC(random_state=5805),
    "Decision Tree": DecisionTreeClassifier(random_state=5805)
}

In [32]:
print("\nEvaluating with Bag of Words:")
for name, clf in classifiers.items():
    accuracy = evaluate_classifier(X_kbest_bow, y, clf)
    print(f"{name}: Accuracy = {accuracy:.4f}")


Evaluating with Bag of Words:
Random Forest: Accuracy = 0.5954
SVM: Accuracy = 0.6043
Decision Tree: Accuracy = 0.5853


In [33]:
tfidf_vectorizer = TfidfVectorizer()
X_tfidf = tfidf_vectorizer.fit_transform(X)

In [34]:
chi2_selector_tfidf = SelectKBest(chi2, k=100)
X_kbest_tfidf = chi2_selector_tfidf.fit_transform(X_tfidf, y)

In [35]:
X_train, X_test, y_train, y_test = train_test_split(X_kbest_tfidf, y, test_size=0.2, random_state=42)

In [36]:
rf_classifier = RandomForestClassifier(n_estimators=100,random_state=5805)
rf_classifier.fit(X_train, y_train)

y_pred = rf_classifier.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f'Random Forest Accuracy: {accuracy}')

Random Forest Accuracy: 0.6080508474576272


In [37]:
svc_classifier = SVC(kernel='rbf',random_state=5805)
svc_classifier.fit(X_train, y_train)

y_pred = svc_classifier.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f'SVC Accuracy: {accuracy}')

SVC Accuracy: 0.614406779661017


In [38]:
decison_tree_classifier = DecisionTreeClassifier(criterion='entropy',random_state=5805)
decison_tree_classifier.fit(X_train, y_train)

y_pred = decison_tree_classifier.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f'Decison Tree Classifier Accuracy: {accuracy}')

Decison Tree Classifier Accuracy: 0.6016949152542372


In [39]:
print("Evaluating with TF-IDF:")
for name, clf in classifiers.items():
    accuracy = evaluate_classifier(X_kbest_tfidf, y, clf)
    print(f"{name}: Accuracy = {accuracy:.4f}")

Evaluating with TF-IDF:
Random Forest: Accuracy = 0.5683
SVM: Accuracy = 0.5946
Decision Tree: Accuracy = 0.5564


In [40]:
def load_glove_embeddings(file_path):
    embeddings = {}
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            vector = np.asarray(values[1:], dtype='float32')
            embeddings[word] = vector
    return embeddings

glove_embeddings = load_glove_embeddings('glove.6B/glove.6B.300d.txt')

In [41]:
def get_embedding(text, embeddings):
    words = text.split()
    valid_words = [embeddings[word] for word in words if word in embeddings]
    if valid_words:
        return np.mean(valid_words, axis=0)  
    else:
        return np.zeros(300)

df['SentenceEmbedding'] = df['Sentence'].apply(lambda x: get_embedding(x, glove_embeddings))
df['AspectEmbedding'] = df['Aspect Term'].apply(lambda x: get_embedding(x, glove_embeddings))

In [42]:
def combine_embeddings(row):
    return np.concatenate((row['SentenceEmbedding'], row['AspectEmbedding']))

df['CombinedEmbedding'] = df.apply(combine_embeddings, axis=1)

In [43]:
X = np.array(df['CombinedEmbedding'].tolist())

In [44]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [45]:
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=5805)
rf_classifier.fit(X_train, y_train)

y_pred = rf_classifier.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f'Random Forest Accuracy: {accuracy}')

Random Forest Accuracy: 0.6864406779661016


In [46]:
svc_classifier = SVC(kernel='rbf',random_state=5805)
svc_classifier.fit(X_train, y_train)

y_pred = svc_classifier.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f'SVC Accuracy: {accuracy}')

SVC Accuracy: 0.5699152542372882


In [47]:
decison_tree_classifier = DecisionTreeClassifier(criterion='entropy',random_state=5805)
decison_tree_classifier.fit(X_train, y_train)

y_pred = decison_tree_classifier.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f'Decison Tree Classifier Accuracy: {accuracy}')

Decison Tree Classifier Accuracy: 0.5466101694915254


In [None]:
print("\nEvaluating Glove:")
for name, clf in classifiers.items():
    accuracy = evaluate_classifier(X, y, clf)
    print(f"{name}: Accuracy = {accuracy:.4f}")


Evaluating Glove:
Random Forest: Accuracy = 0.6209
SVM: Accuracy = 0.6005


In [75]:
model = SentenceTransformer('sentence-transformers/all-distilroberta-v1')

In [76]:
df['SentenceEmbedding'] = df['Sentence'].apply(lambda x: model.encode(x))

In [77]:
df['SentenceEmbedding']

0       [0.01144191, 0.07153592, 0.0031493006, 0.01125...
1       [0.01144191, 0.07153592, 0.0031493006, 0.01125...
2       [-0.023885034, -0.019411264, 0.022051396, 0.01...
3       [-0.023885034, -0.019411264, 0.022051396, 0.01...
4       [-0.023885034, -0.019411264, 0.022051396, 0.01...
                              ...                        
2353    [-0.021907007, 0.032787673, -0.0038888517, -0....
2354    [-0.021907007, 0.032787673, -0.0038888517, -0....
2355    [-0.019594712, 0.08396906, 0.00035555853, 0.02...
2356    [-0.019594712, 0.08396906, 0.00035555853, 0.02...
2357    [-0.018998798, 0.078129694, 0.008430354, 0.012...
Name: SentenceEmbedding, Length: 2358, dtype: object

In [78]:
df['AspectEmbedding'] = df['Aspect Term'].apply(lambda x: model.encode(x))

In [79]:
def combine_embeddings(row):
    return np.concatenate((row['SentenceEmbedding'], row['AspectEmbedding']))

df['CombinedEmbedding'] = df.apply(combine_embeddings, axis=1)

In [80]:
X = np.array(df['CombinedEmbedding'].tolist())

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

array([[ 0.01144191,  0.07153592,  0.0031493 , ..., -0.02138014,
        -0.02532611, -0.0213773 ],
       [ 0.01144191,  0.07153592,  0.0031493 , ...,  0.0069995 ,
         0.00891787, -0.00099972],
       [-0.02388503, -0.01941126,  0.0220514 , ...,  0.00919146,
        -0.00693951, -0.01500397],
       ...,
       [-0.01959471,  0.08396906,  0.00035556, ..., -0.02487313,
         0.02932738,  0.01469556],
       [-0.01959471,  0.08396906,  0.00035556, ..., -0.02487313,
         0.02932738,  0.01469556],
       [-0.0189988 ,  0.07812969,  0.00843035, ...,  0.03455833,
        -0.0046375 , -0.02602183]], dtype=float32)

In [67]:
rf_classifier = RandomForestClassifier(n_estimators=100,random_state=5803)
rf_classifier.fit(X_train, y_train)

y_pred = rf_classifier.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f'Random Forest Accuracy: {accuracy}')

Random Forest Accuracy: 0.6949152542372882


In [68]:
svc_classifier = SVC(kernel='rbf',random_state=5805)
svc_classifier.fit(X_train, y_train)

y_pred = svc_classifier.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f'SVC Accuracy: {accuracy}')

SVC Accuracy: 0.6758474576271186


In [69]:
decison_tree_classifier = DecisionTreeClassifier(criterion='entropy',random_state=5805)
decison_tree_classifier.fit(X_train, y_train)

y_pred = decison_tree_classifier.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f'Decison Tree Classifier Accuracy: {accuracy}')

Decison Tree Classifier Accuracy: 0.597457627118644


In [70]:
print("\nEvaluating with Sentence-Transformer:")
for name, clf in classifiers.items():
    accuracy = evaluate_classifier(X, y, clf)
    print(f"{name}: Accuracy = {accuracy:.4f}")


Evaluating with Sentence-Transformer:
Random Forest: Accuracy = 0.6425
SVM: Accuracy = 0.6680
Decision Tree: Accuracy = 0.4860
