In [70]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.semi_supervised import SelfTrainingClassifier
from sklearn.metrics import accuracy_score, classification_report
import pickle
import warnings
warnings.filterwarnings("ignore")

# Mount Google Drive so files stored there are reachable from the Colab runtime
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

# Load Dataset
# NOTE(review): the folder name ends with a space ("Internship ") - presumably
# this matches the real Drive folder; confirm before "fixing" it.
DATASET_PATH = '/content/drive/MyDrive/Internship /dataset.csv'  # Change path if needed
df = pd.read_csv(DATASET_PATH)

# Strip leading/trailing spaces from column names
df.columns = df.columns.str.strip()

# Drop irrelevant columns
df = df.drop(columns=[
    'Timestamp',
    'Name :',
    'Unnamed: 8',
    'Who is your favorite author or book ?'
])

# Rename columns.
# The assignment is positional: it relies on exactly five columns remaining,
# in this order (pandas raises a length-mismatch error otherwise).
df.columns = [
    "Reading_Frequency",
    "Book_Length",
    "Mood",
    "Interested_Genres",
    "Preferred_Genre"
]

# Keep only rows that have a target label. `.copy()` makes the result an
# independent frame, so the column assignments below do not write into a
# slice of the original (SettingWithCopyWarning).
df = df[df["Preferred_Genre"].notnull()].copy()

# Expand Interested_Genres into binary features (1 = genre mentioned).
# - regex=False: genre names are matched as literal substrings, not patterns.
# - na=False:    a missing Interested_Genres answer counts as "not mentioned"
#                instead of producing NaN, which would make astype(int) fail.
# NOTE(review): substring matching means 'Fiction' also matches any answer
# that merely contains it (e.g. a "Non-Fiction" option, if the survey has one)
# - verify against the actual answer values.
genres = ['Fiction', 'Sci-Fi', 'Self-help', 'Biography', 'Thriller', 'Fantasy']
for genre in genres:
    df[genre] = (
        df["Interested_Genres"]
        .str.contains(genre, regex=False, na=False)
        .astype(int)
    )

# The raw multi-select text is no longer needed once the flags exist.
df = df.drop(columns="Interested_Genres")


# Integer-encode every categorical column (including the target) and keep the
# fitted encoder per column in `le_dict` so the codes can be mapped back later.
# Values are cast to str first, so a missing value is encoded as its own
# "nan" class rather than raising.
categorical_columns = ["Reading_Frequency", "Book_Length", "Mood", "Preferred_Genre"]
le_dict = {name: LabelEncoder() for name in categorical_columns}
for name, encoder in le_dict.items():
    as_text = df[name].astype(str)
    df[name] = encoder.fit_transform(as_text)


# Split the frame into the encoded target and the feature matrix.
y = df["Preferred_Genre"]
X = df.drop(columns=["Preferred_Genre"])

# Simulate a semi-supervised setting: each row independently has an 18% chance
# of having its label hidden. Hidden labels are marked with -1, the value
# scikit-learn's semi-supervised estimators treat as "unlabeled".
np.random.seed(42)
mask = np.random.rand(len(y)) < 0.18
y_semi = y.mask(mask, -1)


# Side-by-side view of the true labels and the partially hidden ones, plus a
# count of how many rows ended up labeled vs. unlabeled (-1).
comparison_df = pd.concat(
    [y.rename("Original_Label"), y_semi.rename("Masked_Label")],
    axis=1,
)
is_unlabeled = y_semi == -1
labeled_count = int((~is_unlabeled).sum())
unlabeled_count = int(is_unlabeled.sum())

print("🔍 First 15 rows (Original vs Masked):")
print(comparison_df.head(15))
print(f"\n Labeled samples: {labeled_count}")
print(f" Unlabeled samples: {unlabeled_count}")


# Standardise the features. The scaler is fitted on every row (labeled and
# unlabeled alike); only feature values are used, never labels.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Self-training wrapper around a random forest: rows whose label is -1 are
# treated as unlabeled, and with criterion='k_best' the k_best most confident
# pseudo-labels are added to the training set on each iteration.
base_model = RandomForestClassifier(n_estimators=100, random_state=42)
model = SelfTrainingClassifier(base_model, criterion='k_best', k_best=20)
model.fit(X_scaled, y_semi)

# Score over ALL rows. This includes the rows the model was fitted on with
# their true labels visible, so it is an optimistic (training-set) figure.
y_pred = model.predict(X_scaled)
acc = accuracy_score(y, y_pred)
print(f"\n Accuracy: {acc*100:.2f}%")
print("\n Classification Report:\n", classification_report(y, y_pred))

# Score restricted to the rows whose labels were hidden from the model
# (`mask`). Their true labels were never seen during fitting, so this is the
# figure that reflects how well the pseudo-labelling actually works.
hidden = np.asarray(mask, dtype=bool)
if hidden.any():
    y_hidden_true = y[hidden]
    y_hidden_pred = y_pred[hidden]
    hidden_acc = accuracy_score(y_hidden_true, y_hidden_pred)
    print(f"\n Accuracy on hidden-label samples only ({int(hidden.sum())} rows): {hidden_acc*100:.2f}%")
    print(
        "\n Classification Report (hidden-label samples only):\n",
        classification_report(y_hidden_true, y_hidden_pred),
    )
else:
    # Nothing was masked, so there is no unseen-label subset to score.
    print("\n No hidden-label samples available for evaluation.")


# Persist the fitted model, the feature scaler and the per-column label
# encoders as pickle files in the current working directory.
artifacts = {
    "model.pkl": model,
    "scaler.pkl": scaler,
    "encoders.pkl": le_dict,
}
for filename, fitted_object in artifacts.items():
    with open(filename, "wb") as handle:
        pickle.dump(fitted_object, handle)

print("\n Files saved: model.pkl, scaler.pkl, encoders.pkl")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
🔍 First 15 rows (Original vs Masked):
    Original_Label  Masked_Label
0                2             2
1                0             0
2                1             1
3                4             4
4                4            -1
5                4            -1
6                4            -1
7                4             4
8                3             3
9                2             2
10               5            -1
11               0             0
12               5             5
13               5             5
14               5             5

 Labeled samples: 31
 Unlabeled samples: 9

 Accuracy: 82.50%

 Classification Report:
               precision    recall  f1-score   support

           0       0.62      0.89      0.73         9
           1       1.00      1.00      1.00         3
           2       0.71      1.00      0.83         5