In [3]:
import pandas as pd
import numpy as np

# Load the dataset
df = pd.read_csv('Data.csv', encoding='latin1')

# Normalize the importance ratings
for rating in ['attr1_1', 'sinc1_1', 'intel1_1', 'fun1_1', 'amb1_1']:
    df.loc[df[rating] > 10, rating] = df[rating] / 10

# Handle missing data
df = df.dropna(subset=['attr3_1', 'sinc3_1', 'intel3_1', 'fun3_1', 'amb3_1', 'attr1_1', 'sinc1_1', 'intel1_1', 'fun1_1', 'amb1_1'])

# Create a new dataframe for the pairs
columns = ['attr3_1', 'sinc3_1', 'intel3_1', 'fun3_1', 'amb3_1', 'attr1_1', 'sinc1_1', 'intel1_1', 'fun1_1', 'amb1_1']
new_df = pd.DataFrame(columns=columns + ['match'])

# Iterate over the rows of the original dataframe
for i, row in df.iterrows():
    # Find the partner row
    partner_row = df[df['iid'] == row['pid']]
    if partner_row.empty:
        continue

    # Compute the absolute difference of the ratings and add to the new dataframe
    diff = np.abs(row[columns] - partner_row.iloc[0][columns])
    diff['match'] = row['match']
    new_df = pd.concat([new_df, pd.DataFrame(diff).transpose()])

# Save the new dataframe to a CSV file
new_df.to_csv('NewData.csv', index=False)


In [4]:
import pandas as pd

# Load the new dataset
df = pd.read_csv('NewData.csv')

# Display a statistical summary of the dataset
print(df.describe())


           attr3_1      sinc3_1     intel3_1       fun3_1       amb3_1  \
count  8122.000000  8122.000000  8122.000000  8122.000000  8122.000000   
mean      1.505787     1.517360     1.151441     1.703275     1.956907   
std       1.297238     1.324936     0.998689     1.426735     1.575382   
min       0.000000     0.000000     0.000000     0.000000     0.000000   
25%       1.000000     1.000000     0.000000     1.000000     1.000000   
50%       1.000000     1.000000     1.000000     1.000000     2.000000   
75%       2.000000     2.000000     2.000000     2.000000     3.000000   
max       7.000000     8.000000     7.000000     8.000000     8.000000   

           attr1_1      sinc1_1     intel1_1       fun1_1       amb1_1  \
count  8122.000000  8122.000000  8122.000000  8122.000000  8122.000000   
mean      2.433962     2.276071     1.541192     2.352602     3.805449   
std       2.790478     3.034659     2.512859     3.203256     3.399164   
min       0.000000     0.000000     0

In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import RandomizedSearchCV

# Load the dataset
df = pd.read_csv('NewData.csv')

# Split the dataset into features and target variable
X = df.drop('match', axis=1)
y = df['match']

# Split the dataset into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Apply SMOTE to generate synthetic samples
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

# List of models
models = [
    ('KNN', KNeighborsClassifier(), {'n_neighbors': list(range(1, 31))}),
    ('SVM', SVC(probability=True), {'C': [0.1, 1], 'kernel': ['rbf']}),
    ('Logistic Regression', LogisticRegression(), {'C': [0.001, 0.01]}),
    ('Gradient Boosting', GradientBoostingClassifier(), {'n_estimators': [100], 'learning_rate': [0.1]})
]

# Train and evaluate each model
for name, model, params in models:
    # Tune hyperparameters using RandomizedSearchCV with reduced iterations
    random_search = RandomizedSearchCV(model, params, n_iter=5)
    random_search.fit(X_train_smote, y_train_smote)
    
    # Train the model with the best parameters on the oversampled training set
    best_model = random_search.best_estimator_
    best_model.fit(X_train_smote, y_train_smote)
    
    # Make predictions on the test set
    y_pred = best_model.predict(X_test)
    
    # Print the confusion matrix and classification report
    print(f'{name}:')
    print(confusion_matrix(y_test, y_pred))
    print(classification_report(y_test, y_pred))
    
    # Make probability predictions on the test set and print percentage compatibility
    y_prob = best_model.predict_proba(X_test)
    compatibility_percentage = y_prob[:, 1]
    print(f'Percentage Compatibility: {compatibility_percentage}')


KNN:
[[848 502]
 [ 48 227]]
              precision    recall  f1-score   support

           0       0.95      0.63      0.76      1350
           1       0.31      0.83      0.45       275

    accuracy                           0.66      1625
   macro avg       0.63      0.73      0.60      1625
weighted avg       0.84      0.66      0.70      1625

Percentage Compatibility: [0.         0.22222222 0.         ... 0.66666667 0.11111111 0.22222222]




KeyboardInterrupt: 

In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report

# Load the new dataset
df = pd.read_csv('NewData.csv')

# Split the dataset into features and target variable
X = df.drop('match', axis=1)
y = df['match']

# Split the dataset into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Define the models and their respective hyperparameters
models = {
    'KNN': (KNeighborsClassifier(), {'n_neighbors': range(1, 31), 'weights': ['uniform', 'distance']}),
    'Logistic Regression': (LogisticRegression(), {'C': [0.001, 0.01, 0.1, 1, 10, 100], 'penalty': ['l1', 'l2']}),
    'Gradient Boosting': (GradientBoostingClassifier(), {'n_estimators': [100, 200, 300], 'learning_rate': [0.01, 0.1, 1], 'max_depth': range(3, 6)})
}

# Train each model and display their performance
for name, (model, params) in models.items():
    print(f'Training {name}...')
    clf = RandomizedSearchCV(model, params)
    clf.fit(X_train, y_train)
    print(f'Best parameters for {name}: {clf.best_params_}')
    y_pred = clf.predict(X_test)
    print(classification_report(y_test, y_pred))


Training KNN...
Best parameters for KNN: {'weights': 'distance', 'n_neighbors': 20}
              precision    recall  f1-score   support

           0       0.97      1.00      0.98      1350
           1       1.00      0.83      0.90       275

    accuracy                           0.97      1625
   macro avg       0.98      0.91      0.94      1625
weighted avg       0.97      0.97      0.97      1625

Training Logistic Regression...


25 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
25 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\N\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\model_selection\_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\N\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\base.py", line 1152, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\N\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\linear_model\_logistic.py", line 1169, in fit
    solver = _check_solver(self.solver, se

Best parameters for Logistic Regression: {'penalty': 'l2', 'C': 0.001}
              precision    recall  f1-score   support

           0       0.83      1.00      0.91      1350
           1       0.00      0.00      0.00       275

    accuracy                           0.83      1625
   macro avg       0.42      0.50      0.45      1625
weighted avg       0.69      0.83      0.75      1625

Training Gradient Boosting...
Best parameters for Gradient Boosting: {'n_estimators': 300, 'max_depth': 5, 'learning_rate': 1}
              precision    recall  f1-score   support

           0       0.97      0.99      0.98      1350
           1       0.92      0.85      0.88       275

    accuracy                           0.96      1625
   macro avg       0.95      0.92      0.93      1625
weighted avg       0.96      0.96      0.96      1625



In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Load the new dataset
df = pd.read_csv('NewData.csv')

# Split the dataset into features and target variable
X = df.drop('match', axis=1)
y = df['match']

# Split the dataset into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Define the models and their respective hyperparameters
models = {
    'KNN': (KNeighborsClassifier(), {'n_neighbors': range(1, 31), 'weights': ['uniform', 'distance']}),
    'Logistic Regression': (LogisticRegression(), {'C': [0.001, 0.01, 0.1, 1, 10, 100], 'penalty': ['l2'], 'solver': ['liblinear', 'lbfgs']}),
    'Gradient Boosting': (GradientBoostingClassifier(), {'n_estimators': [100, 200, 300], 'learning_rate': [0.01, 0.1, 1], 'max_depth': range(3, 6)})
}

# Train each model and display their performance
for name, (model, params) in models.items():
    print(f'Training {name}...')
    clf = RandomizedSearchCV(model, params)
    clf.fit(X_train, y_train)
    print(f'Best parameters for {name}: {clf.best_params_}')
    y_pred = clf.predict(X_test)
    print(classification_report(y_test, y_pred))
    print(confusion_matrix(y_test, y_pred))


Training KNN...
Best parameters for KNN: {'weights': 'distance', 'n_neighbors': 30}
              precision    recall  f1-score   support

           0       0.97      1.00      0.98      1350
           1       1.00      0.83      0.90       275

    accuracy                           0.97      1625
   macro avg       0.98      0.91      0.94      1625
weighted avg       0.97      0.97      0.97      1625

[[1350    0]
 [  48  227]]
Training Logistic Regression...
Best parameters for Logistic Regression: {'solver': 'liblinear', 'penalty': 'l2', 'C': 1}
              precision    recall  f1-score   support

           0       0.83      1.00      0.91      1350
           1       0.00      0.00      0.00       275

    accuracy                           0.83      1625
   macro avg       0.42      0.50      0.45      1625
weighted avg       0.69      0.83      0.75      1625

[[1350    0]
 [ 275    0]]
Training Gradient Boosting...


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Best parameters for Gradient Boosting: {'n_estimators': 300, 'max_depth': 5, 'learning_rate': 1}
              precision    recall  f1-score   support

           0       0.97      0.99      0.98      1350
           1       0.92      0.85      0.88       275

    accuracy                           0.96      1625
   macro avg       0.95      0.92      0.93      1625
weighted avg       0.96      0.96      0.96      1625

[[1330   20]
 [  42  233]]


In [24]:
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix
import joblib

# Load the new dataset
df = pd.read_csv('NewData.csv')

# Split the dataset into features and target variable
X = df.drop('match', axis=1)
y = df['match']

# Split the dataset into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Define the model and its respective hyperparameters
model = KNeighborsClassifier()
params = {'n_neighbors': range(1, 31), 'weights': ['uniform', 'distance']}

# Train the model
print(f'Training KNN...')
clf = RandomizedSearchCV(model, params)
clf.fit(X_train, y_train)
print(f'Best parameters for KNN: {clf.best_params_}')
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

# Save the trained model to a file
joblib.dump(clf, 'knn_model1.pkl')


Training KNN...
Best parameters for KNN: {'weights': 'distance', 'n_neighbors': 28}
              precision    recall  f1-score   support

           0       0.97      1.00      0.98      1350
           1       1.00      0.83      0.90       275

    accuracy                           0.97      1625
   macro avg       0.98      0.91      0.94      1625
weighted avg       0.97      0.97      0.97      1625

[[1350    0]
 [  48  227]]


['knn_model1.pkl']

In [26]:
import joblib
import numpy as np

# Load the saved model
model = joblib.load('knn_model.pkl')

# Ask for user inputs
ratings = []
for partner in ['PARTNER A', 'PARTNER B']:
    print(f'\n{partner}')
    print('\nSelf-ratings(1-10)')
    ratings.append(float(input('1.) Attractiveness: ')))
    ratings.append(float(input('2.) Sincerity: ')))
    ratings.append(float(input('3.) Intelligence: ')))
    ratings.append(float(input('4.) Fun: ')))
    ratings.append(float(input('5.) Ambition: ')))
    print('\nImportance Ratings(1-10)')
    ratings.append(float(input('6.) Attractiveness: ')))
    ratings.append(float(input('7.) Sincerity: ')))
    ratings.append(float(input('8.) Intelligence: ')))
    ratings.append(float(input('9.) Fun: ')))
    ratings.append(float(input('10.) Ambition: ')))

# Preprocess the inputs to get the absolute difference of the partners' ratings
inputs = np.abs(np.array(ratings[:10]) - np.array(ratings[10:]))

# Feed the inputs to the model
prediction = model.predict([inputs])
probabilities = model.predict_proba([inputs])

# Output the model's prediction
if prediction[0] == 1:
    print(f"\nWith a confidence level of {probabilities[0][1] * 100:.2f}%, the AI model predicts you are likely to be a match!")
else:
    print(f"\nWith a confidence level of {probabilities[0][0] * 100:.2f}%, the AI model predicts you are not likely to be a match.")



PARTNER A

Self-ratings(1-10)


1.) Attractiveness:  9
2.) Sincerity:  7
3.) Intelligence:  9
4.) Fun:  6
5.) Ambition:  8



Importance Ratings(1-10)


6.) Attractiveness:  6
7.) Sincerity:  8
8.) Intelligence:  5
9.) Fun:  7
10.) Ambition:  4



PARTNER B

Self-ratings(1-10)


1.) Attractiveness:  6
2.) Sincerity:  4
3.) Intelligence:  9
4.) Fun:  4
5.) Ambition:  8



Importance Ratings(1-10)


6.) Attractiveness:  6
7.) Sincerity:  8
8.) Intelligence:  4
9.) Fun:  7
10.) Ambition:  8



With a confidence level of 77.00%, the AI model predicts you are not likely to be a match.


In [23]:
import joblib
import numpy as np

# Load the saved model
model = joblib.load('knn_model.pkl')

# Initialize a counter for the number of matches found
matches_found = 1

# Define attribute names
attributes = ['Attractiveness', 'Sincerity', 'Intelligence', 'Fun', 'Ambition']

# Continue generating inputs until 10 matches are found
while matches_found < 20:
    # Generate random inputs for two partners
    ratings = np.random.randint(1, 11, size=20)

    # Preprocess the inputs to get the absolute difference of the partners' ratings
    inputs = np.abs(ratings[:10] - ratings[10:])

    # Feed the inputs to the model
    prediction = model.predict([inputs])

    # If the model predicts a match, print out the inputs and increment the counter
    if prediction[0] == 1:
        print(f'\nMatch {matches_found}] found with inputs:')
        for partner in ['A', 'B']:
            print(f'\nPARTNER {partner}')
            print('\nSelf-ratings(1-10)')
            for i in range(5):
                print(f'{attributes[i]}: {ratings[i]}')
            print('\nImportance Ratings(1-10)')
            for i in range(5, 10):
                print(f'{attributes[i-5]}: {ratings[i]}')
            ratings = ratings[10:]
        matches_found += 1



Match 1] found with inputs:

PARTNER A

Self-ratings(1-10)
Attractiveness: 3
Sincerity: 3
Intelligence: 8
Fun: 2
Ambition: 1

Importance Ratings(1-10)
Attractiveness: 3
Sincerity: 1
Intelligence: 7
Fun: 3
Ambition: 1

PARTNER B

Self-ratings(1-10)
Attractiveness: 1
Sincerity: 4
Intelligence: 5
Fun: 10
Ambition: 1

Importance Ratings(1-10)
Attractiveness: 6
Sincerity: 1
Intelligence: 7
Fun: 3
Ambition: 2

Match 2] found with inputs:

PARTNER A

Self-ratings(1-10)
Attractiveness: 5
Sincerity: 6
Intelligence: 3
Fun: 5
Ambition: 5

Importance Ratings(1-10)
Attractiveness: 3
Sincerity: 2
Intelligence: 7
Fun: 7
Ambition: 8

PARTNER B

Self-ratings(1-10)
Attractiveness: 2
Sincerity: 4
Intelligence: 1
Fun: 5
Ambition: 5

Importance Ratings(1-10)
Attractiveness: 4
Sincerity: 2
Intelligence: 7
Fun: 4
Ambition: 2

Match 3] found with inputs:

PARTNER A

Self-ratings(1-10)
Attractiveness: 10
Sincerity: 3
Intelligence: 3
Fun: 9
Ambition: 4

Importance Ratings(1-10)
Attractiveness: 8
Sincerity: 10
