# Libraries

In [None]:
import pickle
import pandas as pd
import seaborn as sns
from os.path import exists
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
import nltk
nltk.download('punkt_tab')
from nltk.sentiment import SentimentAnalyzer
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.sentiment.util import *
from nltk.tokenize import word_tokenize
nltk.download('vader_lexicon')
nltk.download('punkt')

# Loading the Files

Download the CSV files (`train.csv` and `test.csv`) into the `data/` directory.

In [None]:
# Read the raw competition files from the local data directory.
trainingSet = pd.read_csv("./data/train.csv")
testingSet = pd.read_csv("./data/test.csv")

# Shapes first, followed by a blank separator line.
print("train.csv shape is ", trainingSet.shape)
print("test.csv shape is ", testingSet.shape, end="\n\n")

# Previews of both frames and summary statistics of the training frame,
# each separated by one blank line.
print(
    trainingSet.head(),
    testingSet.head(),
    trainingSet.describe(),
    sep="\n\n",
)

# Bar chart of how often each Score value occurs in the training file.
trainingSet['Score'].value_counts().plot.bar(legend=True, alpha=0.5)
plt.show()

print()
print("EVERYTHING IS PROPERLY SET UP! YOU ARE READY TO START")

# Adding Features

In [None]:
# One shared analyzer instance, reused for every review.
sentiment_analyzer = SentimentIntensityAnalyzer()


def get_sentiment_features(text):
    """Score one review text with the shared sentiment analyzer.

    Missing text (anything ``pd.isna`` reports as missing) is treated as
    an empty string. The text is tokenized with ``word_tokenize`` and the
    tokens are re-joined with single spaces before scoring.

    Args:
        text: The review text, or a missing value.

    Returns:
        A ``pd.Series`` built from the dict that ``polarity_scores``
        returns (the caller reads the 'neg', 'neu', 'pos' and 'compound'
        entries).
    """
    cleaned = "" if pd.isna(text) else text
    normalized = " ".join(word_tokenize(cleaned))
    return pd.Series(sentiment_analyzer.polarity_scores(normalized))

def add_features_to(df):
    """Return a copy of ``df`` with the engineered feature columns added.

    Missing values are filled first: ``HelpfulnessNumerator`` with 0,
    ``HelpfulnessDenominator`` with 1 and ``Text`` with "".

    Columns added:
        Helpfulness: ``HelpfulnessNumerator / HelpfulnessDenominator``;
            0 wherever the denominator is 0 or the ratio is missing.
        user_avg_score: mean ``Score`` over all rows sharing the row's
            ``UserId``; 0 when no mean is available.
        neg, neu, pos, compound: the values returned by
            ``get_sentiment_features`` for the row's ``Text``.

    Args:
        df: DataFrame with the columns ``HelpfulnessNumerator``,
            ``HelpfulnessDenominator``, ``Text``, ``UserId`` and ``Score``.

    Returns:
        A new DataFrame; the input frame is not modified.
    """
    # This is where you can do all your feature extraction
    df = df.copy()

    df['HelpfulnessNumerator'] = df['HelpfulnessNumerator'].fillna(0)
    df['HelpfulnessDenominator'] = df['HelpfulnessDenominator'].fillna(1)
    df['Text'] = df['Text'].fillna("")

    # A zero denominator yields NaN (0/0) or +/-inf (x/0). Map every such
    # row to 0 so that no infinite value reaches the model; previously only
    # the NaN case was handled.
    ratio = df['HelpfulnessNumerator'] / df['HelpfulnessDenominator']
    df['Helpfulness'] = ratio.where(df['HelpfulnessDenominator'] != 0, 0).fillna(0)

    # NOTE(review): the group mean includes each row's own Score, so for
    # labelled rows this feature contains the target itself. Validation
    # accuracy measured on a split of these rows will be optimistic;
    # consider a leave-one-out or out-of-fold mean.
    user_avg_score = df.groupby('UserId')['Score'].transform('mean')
    df['user_avg_score'] = user_avg_score.fillna(0)

    sentiment_features = df['Text'].apply(get_sentiment_features)
    df.loc[:, ['neg', 'neu', 'pos', 'compound']] = sentiment_features
    print("Sentiment features added to DataFrame:", df[['neg', 'neu', 'pos', 'compound']].head())

    return df

# Load the feature-extracted files if they have already been generated;
# otherwise build them from the raw frames and cache them for the next run.
#
# Both cache files must be present to take the fast path. Checking them
# independently left X_train undefined when only X_submission.csv existed,
# and threw away a loaded X_train when only X_train.csv existed.
X_TRAIN_PATH = "./data/X_train.csv"
X_SUBMISSION_PATH = "./data/X_submission.csv"

if exists(X_TRAIN_PATH) and exists(X_SUBMISSION_PATH):
    X_train = pd.read_csv(X_TRAIN_PATH)
    X_submission = pd.read_csv(X_SUBMISSION_PATH)
else:
    # Process the DataFrame
    train = add_features_to(trainingSet)

    # Merge on Id so that the submission set can have feature columns as well.
    # Both frames carry a Score column, so the merge produces Score_x (from
    # train) and Score_y (from testingSet); only the latter is kept.
    X_submission = pd.merge(train, testingSet, on='Id')
    X_submission = X_submission.drop(columns=['Score_x'])
    X_submission = X_submission.rename(columns={'Score_y': 'Score'})

    # The training set is where the score is not null
    X_train = train[train['Score'].notnull()]

    # Write to the same paths that are checked above so the cache is
    # actually found on the next run.
    X_submission.to_csv(X_SUBMISSION_PATH, index=False)
    X_train.to_csv(X_TRAIN_PATH, index=False)

print(X_train.head())

# Sample + Split into training and testing set

In [None]:
# Hold out a quarter of the labelled rows for evaluation. The seed is fixed
# so the split is reproducible. Note that X_train is rebound to the
# feature-only training portion (Score is moved into Y_train / Y_test).
X_train, X_test, Y_train, Y_test = train_test_split(
    X_train.drop('Score', axis=1),
    X_train['Score'],
    test_size=0.25,
    random_state=0,
)

# Feature Selection

In [None]:
# Columns fed to the model, in the order the model will see them.
features = [
    'HelpfulnessNumerator',
    'Helpfulness',
    'HelpfulnessDenominator',
    'user_avg_score',
    'neg',
    'pos',
    'neu',
    'compound',
]

# Project each of the three frames onto the same feature columns.
X_train_select, X_test_select, X_submission_select = (
    frame[features] for frame in (X_train, X_test, X_submission)
)

# Model Creation

In [None]:
import lightgbm as lgb
from sklearn.metrics import mean_squared_error
from lightgbm import LGBMClassifier

In [None]:
# LightGBM settings: gradient-boosted trees on a five-class objective,
# evaluated with multi-class log loss.
params = dict(
    boosting_type='gbdt',
    objective='multiclass',
    num_class=5,
    metric='multi_logloss',
    num_leaves=31,
    learning_rate=0.1,
    feature_fraction=0.8,
    bagging_fraction=0.8,
    bagging_freq=5,
    max_depth=-1,
)

# Fit on the training portion of the split.
model = LGBMClassifier(**params)
model.fit(X_train_select, Y_train)

# Predictions on the held-out portion, used by the evaluation step.
Y_test_predictions = model.predict(X_test_select)

# Model Evaluation

In [None]:
# Report accuracy on the held-out rows.
print("Accuracy on testing set = ", accuracy_score(Y_test, Y_test_predictions))

# Confusion matrix normalised over the true labels (each row sums to 1),
# drawn as an annotated heatmap.
cm = confusion_matrix(Y_test, Y_test_predictions, normalize='true')
heatmap_axes = sns.heatmap(cm, annot=True)
heatmap_axes.set_title('Confusion matrix of the classifier')
heatmap_axes.set_xlabel('Predicted')
heatmap_axes.set_ylabel('True')
plt.show()

# Create submission file

In [None]:
# Fill in the predicted Score for every submission row, then write only
# the Id / Score pair to the submission CSV.
X_submission['Score'] = model.predict(X_submission_select)
submission = X_submission.loc[:, ['Id', 'Score']]
submission.to_csv("./data/submission.csv", index=False)