In [8]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Download the VADER lexicon that SentimentIntensityAnalyzer reads.
nltk.download('vader_lexicon')

# Load data from Google Drive (Colab-only: the drive is mounted at /drive).
from google.colab import drive
drive.mount("/drive", force_remount=True)
train_df = pd.read_csv('/drive/MyDrive/train.csv')
test_df = pd.read_csv('/drive/MyDrive/test.csv')

# Handle missing values.
# Assign the filled column back instead of calling fillna(inplace=True) on
# train_df['text']: the chained in-place form emits a FutureWarning on recent
# pandas and, with Copy-on-Write enabled, only modifies a temporary Series,
# which would leave NaN in the text handed to TF-IDF and VADER below.
train_df['text'] = train_df['text'].fillna('')
test_df['text'] = test_df['text'].fillna('')

# Split data into features (review text) and target (star rating).
X_train_text = train_df['text']
y_train = train_df['stars']
X_test_text = test_df['text']
y_test = test_df['stars']

# Text feature extraction pipeline: TF-IDF over uni-, bi- and tri-grams,
# capped at 5000 features, with English stop words removed.
vectorizer = TfidfVectorizer(ngram_range=(1, 3), max_features=5000, stop_words='english')
text_pipeline = make_pipeline(vectorizer)

# Fit the vocabulary on the training text only, then reuse it for the test text.
X_train_text_features = text_pipeline.fit_transform(X_train_text)
X_test_text_features = text_pipeline.transform(X_test_text)

# Numeric metadata columns used alongside the text features.
other_features = ['user_reputation', 'reply_count', 'thumbs_up', 'thumbs_down', 'best_score']
X_train_numeric = train_df[other_features]
X_test_numeric = test_df[other_features]

# Extract sentiment scores from text data
sid = SentimentIntensityAnalyzer()


def _compound_sentiment(text):
    """Return the VADER 'compound' polarity score for a single review text."""
    return sid.polarity_scores(text)['compound']


train_df['sentiment_score'] = train_df['text'].apply(_compound_sentiment)
test_df['sentiment_score'] = test_df['text'].apply(_compound_sentiment)

# Standardise the numeric columns with statistics learned from the training set.
scaler = StandardScaler()
X_train_numeric_scaled = scaler.fit_transform(X_train_numeric)
X_test_numeric_scaled = scaler.transform(X_test_numeric)


def _assemble_features(text_features, numeric_scaled, sentiment):
    """Join TF-IDF, scaled numeric and sentiment features column-wise.

    Parameters
    ----------
    text_features : sparse matrix returned by ``text_pipeline``; it is
        densified here, so memory grows with rows x 5000 columns.
    numeric_scaled : array returned by ``scaler``, one column per entry of
        ``other_features``.
    sentiment : pandas Series of sentiment scores; its index is reused for
        every part so the column-wise concat cannot misalign rows.

    Returns
    -------
    pandas.DataFrame whose columns are, in order, ``tfidf_0 .. tfidf_{k-1}``,
    the names in ``other_features``, and ``sentiment_score``. All labels are
    unique strings (previously the TF-IDF and numeric parts both used 0..4,
    which produced duplicate column names).
    """
    row_index = sentiment.index
    text_frame = pd.DataFrame(
        text_features.toarray(),
        index=row_index,
        columns=[f"tfidf_{i}" for i in range(text_features.shape[1])],
    )
    numeric_frame = pd.DataFrame(
        numeric_scaled,
        index=row_index,
        columns=list(other_features),
    )
    return pd.concat([text_frame, numeric_frame, sentiment], axis=1)


# Combine text, numeric, and sentiment features
X_train = _assemble_features(X_train_text_features, X_train_numeric_scaled, train_df['sentiment_score'])
X_test = _assemble_features(X_test_text_features, X_test_numeric_scaled, test_df['sentiment_score'])

# Tune a Random Forest with a small grid search: 3-fold cross-validation,
# scored on accuracy, using all available cores.
param_grid = {
    'n_estimators': [100, 150],
    'max_depth': [None, 20],
}
rf_classifier = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(
    estimator=rf_classifier,
    param_grid=param_grid,
    cv=3,
    scoring='accuracy',
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# Show the winning hyperparameters and the cross-validated score they achieved.
print("Best Parameters:", grid_search.best_params_)
print("Best Accuracy:", grid_search.best_score_)

# Predict the test-set labels through the fitted grid search.
y_pred = grid_search.predict(X_test)

# Save the predictions, one per line, with no index and no header row.
pd.Series(y_pred).to_csv("/drive/MyDrive/pred.csv", index=False, header=False)

# Accuracy on the held-out test set.
final_accuracy = accuracy_score(y_test, y_pred)
print(f'Final Accuracy: {final_accuracy}')

# Save the accuracy as plain text in accuracy.csv.
with open("/drive/MyDrive/accuracy.csv", "w") as accuracy_file:
    accuracy_file.write(str(final_accuracy))

# Question a: per-class precision and recall on the test predictions.
print("""
Question (a): How does the performance of your model vary across different classes?
Analyzing precision and recall metrics for each class.
""")
print("Precision and Recall metrics for each class:")
# Request the classification report as a dict so individual metrics can be read.
report = classification_report(y_test, y_pred, output_dict=True)
for star_class, class_metrics in report.items():
    # Only entries whose key is a digit string are printed; all others are skipped.
    if not star_class.isdigit():
        continue
    print(f"Class {int(star_class)}:")
    print(f"Precision: {class_metrics['precision']}")
    print(f"Recall: {class_metrics['recall']}")

# NOTE: the observations below are fixed text written for one particular run;
# they are not derived from `report` and will not change if the metrics do.
print("""
The performance of the model varies across different classes, as evident from the precision and recall metrics for each class are presented above:

Observations:
- For Class 0 and Class 1, precision values are moderate, indicating that the model correctly identifies a reasonable proportion of instances belonging to these classes.
  However, recall values are relatively low, suggesting that the model may miss many instances of these classes.
- Class 2 shows perfect precision but very low recall, indicating that while the model identifies all instances it predicts as Class 2 correctly, it misses many actual instances of this class.
- Class 3 has moderate precision but low recall, suggesting that the model identifies some instances correctly but misses a significant number of actual instances of Class 3.
- Class 4 has moderate precision and very low recall, indicating that the model correctly identifies some instances but misses a considerable number of actual instances of Class 4.
- Class 5 has relatively high precision and very high recall, indicating that the model correctly identifies most instances of Class 5 and minimizes false negatives.

Overall, the model performs relatively well for Class 5, but there are significant challenges in accurately predicting other classes, particularly for Class 2 where recall is extremely low.
This imbalance in precision and recall across different classes suggests potential areas for model improvement and further analysis.
""")

# Question b: recommendation for real-world use and its limitations.
print("""
Question (b): Considering your analysis, how would you recommend using this model in a real-world application?
Discuss any limitations or considerations that should be taken into account.
""")
print("Recommendation:")
print("This model can be used for sentiment analysis in product reviews. However, some limitations include:")
for limitation in (
    "- Imbalanced class distribution may affect model performance.",
    "- Model may not generalize well to unseen data if the distribution of reviews changes over time.",
    "- Sentiment analysis may not capture nuanced opinions.",
):
    print(limitation)

# Question c: proposed remedies (described only; nothing is retrained here).
print("""
Question (c): Analyze your data to address the previously identified accuracy issues. Describe your method to address this issue,
implement it in code and retrain a classifier, and assess any improvements or ongoing challenges.
""")
print("Addressing Accuracy Issues:")
remedies_text = (
    "To address accuracy issues, we can try stratified sampling or oversampling techniques for imbalanced classes, "
    "experiment with different feature engineering approaches, or try different algorithms such as ensemble methods "
    "or neural networks."
)
print(remedies_text)
print("However, implementing and evaluating these techniques require further experimentation and analysis.")


[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


Mounted at /drive
Best Parameters: {'max_depth': None, 'n_estimators': 100}
Best Accuracy: 0.7712617982771673
Final Accuracy: 0.7830629639813033

Question (a): How does the performance of your model vary across different classes? 
Analyzing precision and recall metrics for each class.

Precision and Recall metrics for each class:
Class 0:
Precision: 0.6153846153846154
Recall: 0.1411764705882353
Class 1:
Precision: 0.5882352941176471
Recall: 0.21739130434782608
Class 2:
Precision: 1.0
Recall: 0.022727272727272728
Class 3:
Precision: 0.5
Recall: 0.061224489795918366
Class 4:
Precision: 0.6818181818181818
Recall: 0.04672897196261682
Class 5:
Precision: 0.7892785856857714
Recall: 0.9928263988522238

The performance of the model varies across different classes, as evident from the precision and recall metrics for each class are presented above:

Observations:
- For Class 0 and Class 1, precision values are moderate, indicating that the model correctly identifies a reasonable proportion of i