In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report
import joblib

In [2]:
# Read the love-compatibility CSV into a DataFrame
dataset_path = 'Datasets/love_compatibility_dataset.csv'
data = pd.read_csv(dataset_path)

# Display the first five rows as a quick sanity check of columns and values
data.head(n=5)


Unnamed: 0,Boy_Name,Girl_Name,Boy_Age,Girl_Age,Boy_FavColor,Girl_FavColor,Boy_Hobby,Girl_Hobby,Boy_FavMovieGenre,Girl_FavMovieGenre,Boy_Zodiac,Girl_Zodiac,Compatibility_Score
0,Richard,Kimberly,37,23,Brown,Purple,Drawing,Gaming,Adventure,Romance,Taurus,Aries,82
1,Joseph,Kimberly,19,24,Yellow,Green,Gaming,Drawing,Fantasy,Comedy,Leo,Virgo,100
2,Richard,Linda,40,40,Black,Brown,Drawing,Hiking,Horror,Fantasy,Scorpio,Pisces,100
3,Richard,Laura,26,39,Black,Black,Gaming,Gaming,Adventure,Romance,Aries,Pisces,94
4,Richard,Nancy,37,34,Blue,Blue,Fishing,Drawing,Adventure,Thriller,Scorpio,Scorpio,104


In [3]:
# One-hot encode the categorical preference columns of both partners
# (favourite colour, hobby, favourite movie genre, zodiac sign).
categorical_columns = [
    'Boy_FavColor', 'Girl_FavColor',
    'Boy_Hobby', 'Girl_Hobby',
    'Boy_FavMovieGenre', 'Girl_FavMovieGenre',
    'Boy_Zodiac', 'Girl_Zodiac',
]
data_encoded = pd.get_dummies(data, columns=categorical_columns)

# Display the first five rows of the encoded frame
data_encoded.head(n=5)

Unnamed: 0,Boy_Name,Girl_Name,Boy_Age,Girl_Age,Compatibility_Score,Boy_FavColor_Black,Boy_FavColor_Blue,Boy_FavColor_Brown,Boy_FavColor_Green,Boy_FavColor_Orange,...,Girl_Zodiac_Cancer,Girl_Zodiac_Capricorn,Girl_Zodiac_Gemini,Girl_Zodiac_Leo,Girl_Zodiac_Libra,Girl_Zodiac_Pisces,Girl_Zodiac_Sagittarius,Girl_Zodiac_Scorpio,Girl_Zodiac_Taurus,Girl_Zodiac_Virgo
0,Richard,Kimberly,37,23,82,False,False,True,False,False,...,False,False,False,False,False,False,False,False,False,False
1,Joseph,Kimberly,19,24,100,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
2,Richard,Linda,40,40,100,True,False,False,False,False,...,False,False,False,False,False,True,False,False,False,False
3,Richard,Laura,26,39,94,True,False,False,False,False,...,False,False,False,False,False,True,False,False,False,False
4,Richard,Nancy,37,34,104,False,True,False,False,False,...,False,False,False,False,False,False,False,True,False,False


In [4]:
# Features and target variable
#
# 'Unnamed: 0' appears to be the row index that was written into the CSV
# (it counts 0, 1, 2, ... in the preview). It carries no information about a
# couple and must not be used as a feature, so it is removed first.
# errors='ignore' keeps this cell working if the column is absent (for
# example when the CSV is loaded with index_col=0).
features_frame = data_encoded.drop(columns=['Unnamed: 0'], errors='ignore')

# Names are identifiers rather than predictive traits, and Compatibility_Score
# is the source of the target, so neither belongs in the feature matrix.
X = features_frame.drop(columns=['Boy_Name', 'Girl_Name', 'Compatibility_Score'])

# Binary classification: High Compatibility (1) vs Low Compatibility (0).
# The label is derived from the same frame as X so rows stay aligned.
y = (data_encoded['Compatibility_Score'] > 70).astype(int)

In [5]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
split_options = {"test_size": 0.2, "random_state": 42}
split_parts = train_test_split(X, y, **split_options)
X_train, X_test, y_train, y_test = split_parts

# Report the size of each partition
print(f"Training data shape: {X_train.shape}")
print(f"Testing data shape: {X_test.shape}")

Training data shape: (12000, 86)
Testing data shape: (3000, 86)


In [6]:
# Initialize models
models = {
    "Logistic Regression": LogisticRegression(max_iter=500),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    # With probability=True, SVC shuffles the data internally to fit its
    # probability estimates. A fixed random_state makes the fitted (and later
    # saved) model reproducible across runs, like the seeded Random Forest.
    "Support Vector Machine": SVC(kernel='linear', probability=True, random_state=42),
    "K-Nearest Neighbors": KNeighborsClassifier(n_neighbors=5)
}

# Train and evaluate models
# model_scores maps each model's display name to its accuracy on the test split.
model_scores = {}

for name, model in models.items():
    # Train the model on the training split
    model.fit(X_train, y_train)
    # Predict on the held-out split and score the predictions
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    model_scores[name] = accuracy
    print(f"{name}: Accuracy = {accuracy:.2f}")

Logistic Regression: Accuracy = 0.95
Random Forest: Accuracy = 0.95
Support Vector Machine: Accuracy = 0.95
K-Nearest Neighbors: Accuracy = 0.94


In [7]:
# Select the model with the highest test accuracy
# (on a tie, the first one in insertion order wins)
best_model_name = max(model_scores.keys(), key=lambda candidate: model_scores[candidate])
best_model = models[best_model_name]

print(f"Best Model: {best_model_name} with Accuracy = {model_scores[best_model_name]:.2f}")

Best Model: Random Forest with Accuracy = 0.95


In [8]:
# Save the best model
# The filename is derived from the model's display name
# (e.g. "Random Forest" -> "random_forest_model.joblib"). It is built once so
# the file written and the message printed can never disagree.
model_filename = f"{best_model_name.replace(' ', '_').lower()}_model.joblib"
joblib.dump(best_model, model_filename)

print(f"Best model saved as {model_filename}")

Best model saved as random_forest_model.joblib


In [9]:
# Persist the ordered list of feature column names used for training.
# (joblib is already imported in the first cell; no re-import is needed here.)
# NOTE(review): presumably loaded at inference time to rebuild the same
# one-hot column layout - confirm against the consuming code.
training_columns = list(X_train.columns)
with open('training_columns.joblib', 'wb') as columns_file:
    joblib.dump(training_columns, columns_file)