<a href="https://colab.research.google.com/github/spgauthaman/Gauthaman_assignment/blob/main/Gauthaman_assignment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [10]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack

# 1. Load Data
train_df = pd.read_csv('/content/drive/MyDrive/Mentorship/train.csv')
test_df = pd.read_csv('/content/drive/MyDrive/Mentorship/test.csv')

# 2. Setup Preprocessing Tools
nltk.download('stopwords')
nltk.download('wordnet')
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def clean_text(text):
    if pd.isna(text): return "neutral"
    # Lowercase and remove punctuation/numbers
    text = re.sub(r'[^a-zA-Z\s]', '', str(text).lower())
    # Tokenize, remove stopwords, and lemmatize
    tokens = [lemmatizer.lemmatize(w) for w in text.split() if w not in stop_words]
    return " ".join(tokens) if len(tokens) > 0 else "neutral"

# 3. Apply Cleaning
train_df['cleaned_review'] = train_df['Review Text'].apply(clean_text)
test_df['cleaned_review'] = test_df['Review Text'].apply(clean_text)

# 4. Handle Metadata (App Version Code)
train_df['App Version Code'] = train_df['App Version Code'].fillna(0).astype(int)
test_df['App Version Code'] = test_df['App Version Code'].fillna(0).astype(int)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [11]:
# Initialize TF-IDF
tfidf_vec = TfidfVectorizer(ngram_range=(1, 2), max_features=5000)

# Vectorize Text
X_tfidf_train = tfidf_vec.fit_transform(train_df['cleaned_review'])
X_tfidf_test = tfidf_vec.transform(test_df['cleaned_review'])

# Combine TF-IDF with App Version Code
X_final_train = hstack([X_tfidf_train, train_df[['App Version Code']].values])
X_final_test = hstack([X_tfidf_test, test_df[['App Version Code']].values])

# Target Variable (ensure the column name matches your CSV, e.g., 'Star Rating')
# Replace 'Star Rating' with the actual column name for the 1-5 integer
y_train = train_df['Star Rating']

In [12]:
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Setup Model and Parameters
rf = RandomForestClassifier(random_state=42, class_weight='balanced')
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5]
}

# 5-Fold Stratified Cross-Validation
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Grid Search
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=cv, scoring='accuracy', n_jobs=-1, verbose=1)
grid_search.fit(X_final_train, y_train)

print(f"Best Parameters: {grid_search.best_params_}")
print(f"Best CV Accuracy: {grid_search.best_score_:.4f}")

# Detailed Validation Report
best_model = grid_search.best_estimator_
y_train_pred = best_model.predict(X_final_train)
print("\nInternal Validation Report:")
print(classification_report(y_train, y_train_pred))

Fitting 5 folds for each of 12 candidates, totalling 60 fits
Best Parameters: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 200}
Best CV Accuracy: 0.7007

Internal Validation Report:
              precision    recall  f1-score   support

           1       1.00      0.99      0.99      1788
           2       0.81      0.97      0.88       154
           3       0.63      0.96      0.76       217
           4       0.78      0.88      0.83       611
           5       0.97      0.90      0.94      2923

    accuracy                           0.93      5693
   macro avg       0.84      0.94      0.88      5693
weighted avg       0.94      0.93      0.94      5693



In [14]:
# Predict on Test Data
test_predictions = best_model.predict(X_final_test)

# Create Submission DataFrame
submission = pd.DataFrame({
    'id': test_df['id'],
    'Predicted_Rating': test_predictions
})

# Save to CSV
submission.to_csv('predictions.csv', index=False)
print("Final predictions saved to predictions.csv")

Final predictions saved to predictions.csv
