In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively walk the Kaggle input directory and echo the full path of
# every file found, one path per line.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        full_path = os.path.join(dirname, filename)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

Import Data

In [None]:
# Read the raw speed-dating table and preview its first five rows.
dating = pd.read_csv(filepath_or_buffer='../input/speed-dating/speeddating.csv')
dating.head(n=5)

In [None]:
# Number of NaN cells in each column (`isna` is the same check as `isnull`).
dating.isna().sum()

Drop irrelevant columns

In [None]:
# Every column name of the raw frame, as a plain Python list.
dating.columns.tolist()

Attribute lists: drop the columns listed below, then show the remaining attributes

In [None]:
# Columns removed before modelling.
# NOTE(review): the odd spellings ('d_sinsere_o', 'd_ambitous_o',
# 'd_intellicence_important', 'd_ambtition_important') presumably match the
# CSV header verbatim -- confirm before "correcting" any of them.
columns_to_drop = [
    # bookkeeping
    'has_null', 'wave',
    # expectations
    'expected_happy_with_sd_people',
    'expected_num_interested_in_me',
    'expected_num_matches',
    'd_expected_happy_with_sd_people',
    'd_expected_num_interested_in_me',
    'd_expected_num_matches',
    # individual decisions
    'decision', 'decision_o',
    # d_* importance / partner-preference columns
    'd_importance_same_race', 'd_importance_same_religion',
    'd_pref_o_attractive', 'd_pref_o_sincere', 'd_pref_o_intelligence',
    'd_pref_o_funny', 'd_pref_o_ambitious', 'd_pref_o_shared_interests',
    # d_* ratings received from the partner
    'd_attractive_o', 'd_sinsere_o', 'd_intelligence_o',
    'd_funny_o', 'd_ambitous_o', 'd_shared_interests_o',
    # d_* stated importance
    'd_attractive_important', 'd_sincere_important',
    'd_intellicence_important', 'd_funny_important',
    'd_ambtition_important', 'd_shared_interests_important',
    # d_* self ratings
    'd_attractive', 'd_sincere', 'd_intelligence', 'd_funny', 'd_ambition',
    # d_* ratings given to the partner
    'd_attractive_partner', 'd_sincere_partner', 'd_intelligence_partner',
    'd_funny_partner', 'd_ambition_partner', 'd_shared_interests_partner',
    # d_* interests
    'd_sports', 'd_tvsports', 'd_exercise', 'd_dining', 'd_museums',
    'd_art', 'd_hiking', 'd_gaming', 'd_clubbing', 'd_reading', 'd_tv',
    'd_theater', 'd_movies', 'd_concerts', 'd_music', 'd_shopping',
    'd_yoga',
    # d_* summary scores
    'd_interests_correlate', 'd_like', 'd_guess_prob_liked',
]

# `columns=` already selects the column axis, so no `axis` argument is needed.
# Re-running this cell on the already-reduced frame raises KeyError, because
# the labels no longer exist.
dating = dating.drop(columns=columns_to_drop)
dating.head()

In [None]:
# Column names that remain after the drop, as a plain Python list.
dating.columns.tolist()

Clean Data

The `DataCleaner` class below comes from https://www.kaggle.com/polarbearyap/speeddating-part-ii

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin, clone
import re


# Use a custom transformer for data preprocessing
class DataCleaner(BaseEstimator, TransformerMixin):
    """Classify DataFrame columns and normalise their values.

    ``fit`` scans every column and sorts it into bookkeeping lists
    (numerical, categorical, stored with the wrong dtype, containing invalid
    tokens) while collecting the invalid tokens it met.  ``transform``
    replaces ``'?'`` cells with NaN, casts the numerical columns to
    ``float64`` and strips the surrounding single quotes from categorical
    values.

    Parameters
    ----------
    y_feature : str
        Name of the target column.  It is stored on the instance but is not
        read by any method of this class.
    """

    # Raw strings: ``\d`` and ``\?`` must reach the regex engine untouched and
    # are not valid Python string escapes.
    _INT_PATTERN = r'^\d+$'
    _FLOAT_PATTERN = r'^-?\d+\.\d+$|^\d+$'
    _MISSING_PATTERN = r'^\?$'
    _QUOTED_PATTERN = r"^'.+'$"

    def __init__(self, y_feature):
        self.y_feature = y_feature
        self._reset_state()

    def _reset_state(self):
        """(Re)create the empty bookkeeping containers filled by ``fit``."""
        self.features_with_wrong_data_type = []
        self.numerical_features = []
        self.categorical_features = []
        self.features_with_invalid_value = []
        self.one_hot_features = []
        self.invalid_values = set()

    def getNumericalFeatures(self):
        """Return the list of numerical feature names (the internal list, not a copy)."""
        return self.numerical_features

    def getCategoricalFeatures(self):
        """Return the list of categorical feature names (the internal list, not a copy)."""
        return self.categorical_features

    def getInvalidValues(self):
        """Return the set of invalid-value strings collected so far."""
        return self.invalid_values

    def detect_int_value(self, data):
        """Return True if any value of ``data``, as a string, consists only of digits."""
        return np.any(data.astype(str).str.contains(self._INT_PATTERN, regex=True))

    def detect_float_value(self, data):
        """Return True if any value of ``data``, as a string, looks like a
        (possibly negative) decimal number or an unsigned integer."""
        return np.any(data.astype(str).str.contains(self._FLOAT_PATTERN, regex=True))

    @staticmethod
    def _non_matching_as_text(data, pattern):
        """Join the distinct values of ``data`` that do not match ``pattern``.

        Values keep the order ``value_counts`` gives them; NaN is left out
        because ``value_counts`` drops it.  Each value is converted with
        ``str`` so that numeric columns can be reported as well
        (``str.join`` rejects non-string items).
        """
        matches = data.astype(str).str.contains(pattern, regex=True)
        offending = data[~matches].value_counts().index
        return ', '.join(str(value) for value in offending)

    def get_invalid_int_value(self, data):
        """Return the values of ``data`` that are not unsigned integers as one
        ``', '``-joined string (``''`` when there are none)."""
        return self._non_matching_as_text(data, self._INT_PATTERN)

    def get_invalid_float_value(self, data):
        """Return the values of ``data`` that do not look like numbers as one
        ``', '``-joined string (``''`` when there are none)."""
        return self._non_matching_as_text(data, self._FLOAT_PATTERN)

    def drop_rows_with_unknow_values(self, data, feature):
        """Return the rows of ``data`` whose ``feature`` cell is not NaN."""
        return data[data[feature].notna()]

    def _flag_invalid(self, feature, invalid_value):
        """Record ``feature`` and its invalid tokens; no-op for an empty string."""
        if invalid_value == '':
            return
        self.invalid_values.add(invalid_value)
        if feature not in self.features_with_invalid_value:
            self.features_with_invalid_value.append(feature)

    def find_invalid_values(self, data):
        """Classify every column of ``data`` and collect invalid tokens.

        ``data`` itself is not modified: dtype probing is done on a local
        copy of each column, and the actual casts happen in ``transform``.
        """
        for feature in data.columns.values:
            column = data[feature]

            if column.dtype == 'object':
                # No number-like value at all: a genuine categorical column.
                if not self.detect_float_value(column):
                    self.categorical_features.append(feature)
                    continue

                # Number-like values stored as strings.
                self.features_with_wrong_data_type.append(feature)
                try:
                    column = column.astype('float64')
                except (ValueError, TypeError):
                    # Some token is not a number; remember which, and leave
                    # the cast to transform(), after '?' has become NaN.
                    self._flag_invalid(
                        feature, self.get_invalid_float_value(column))
                    continue

            if column.dtype == 'int64':
                self._flag_invalid(feature, self.get_invalid_int_value(column))
                self.numerical_features.append(feature)
            elif column.dtype == 'float64':
                self._flag_invalid(feature, self.get_invalid_float_value(column))
                self.numerical_features.append(feature)

    def fit(self, data, y=None):
        """Learn the column classification from ``data`` and return ``self``.

        State from a previous ``fit`` is discarded first, so re-fitting does
        not duplicate entries in the feature lists.
        """
        self._reset_state()
        self.find_invalid_values(data)
        return self

    def transform(self, data, y=None):
        """Return a cleaned copy of ``data``.

        Steps: cells equal to ``'?'`` become NaN; every numerical feature
        (including those that held invalid tokens) is cast to ``float64``;
        categorical values wrapped in single quotes lose the quotes.

        Raises whatever ``Series.astype`` raises when a numerical column
        still holds a non-numeric token after the ``'?'`` replacement.
        """
        # replace() returns a new frame, so the caller's frame is untouched.
        data = data.replace(self._MISSING_PATTERN, np.nan, regex=True)

        # The repaired features count as numerical from now on; the
        # membership test keeps repeated transform() calls from adding
        # duplicates.
        for feature in self.features_with_invalid_value:
            if feature not in self.numerical_features:
                self.numerical_features.append(feature)

        for feature in self.numerical_features:
            data[feature] = data[feature].astype('float64', errors='raise')

        # Remove unwanted quotes: change values like ''Example'' to 'Example'
        for feature in self.categorical_features:
            for value in data[feature].value_counts().index:
                if not isinstance(value, str):
                    continue
                if re.search(self._QUOTED_PATTERN, value.replace(' ', '')):
                    # Strip the spaces the check above ignored, so the slice
                    # removes the quotes rather than a space.
                    unquoted = value.strip(' ')[1:-1]
                    data.loc[data[feature] == value, feature] = unquoted

        return data

In [None]:
# A copy is handed to the cleaner so that `dating` itself is never modified.
cleaner = DataCleaner(y_feature='match')
dating1 = cleaner.fit_transform(X=dating.copy())

In [None]:
# Show the invalid tokens the cleaner collected.
invalid_values = cleaner.getInvalidValues()
print(f'Invalid values found: {invalid_values}')

In [None]:
print('List of numerical features:')
# getNumericalFeatures() hands back the cleaner's own list, so calling
# .remove('match') on it would alter the cleaner and raise ValueError when
# this cell is run a second time.  Build a new list without the target.
num_attr = [feature for feature in cleaner.getNumericalFeatures()
            if feature != 'match']
num_attr

In [None]:
# Names of the columns the cleaner classified as categorical.
cat_attr = cleaner.getCategoricalFeatures()
print('List of categorical features:')
cat_attr

In [None]:
# First five rows of the cleaned frame.
dating1.head(n=5)

Imputation of missing values with most frequent values

In [None]:
from sklearn.impute import SimpleImputer
# Fill every missing cell with the most frequent value of its column.
# NOTE(review): the imputer is fitted on the full table before the train/test
# split further down, so test rows influence the fill values -- confirm this
# is acceptable.
imp = SimpleImputer(strategy='most_frequent')

# fit_transform returns a bare array, so labels and dtypes have to be restored
# by hand.  Without the astype() every column of the rebuilt frame (numeric
# ones included) would be left with whatever dtype the array carries.
dating2 = pd.DataFrame(
    imp.fit_transform(dating1),
    columns=dating1.columns,
    index=dating1.index,
).astype(dating1.dtypes.to_dict())
dating2.head()

X and Y split

In [None]:
# Feature matrix: every column except the target.
X = dating2.drop(labels='match', axis='columns')
X.head(n=5)

In [None]:
# Target vector, cast to integer class labels.
y = dating2['match'].astype('int')
y.head(n=5)

Train/test split, then SMOTE-NC oversampling of the training set to correct the class imbalance

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows; stratifying on y keeps the class ratio equal in
# both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

SMOTENC
0, 4, 5, 6, 10 --> column positions of the categorical features in X_train

In [None]:
from imblearn.over_sampling import SMOTENC
# Oversample the training split only.  The integers are positional column
# indices of the categorical features.
# NOTE(review): they are hard-coded, so they must be revisited whenever the
# column order of X changes.
smotenc = SMOTENC(categorical_features=[0, 4, 5, 6, 10], random_state=42)
X_oversample, y_oversample = smotenc.fit_resample(X_train, y_train)

Random Forest Classifier

In [None]:
# RandomForestClassifier lives in sklearn.ensemble; the original
# `from sklearn. import ...` is a SyntaxError.
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Candidate forest sizes: 10, 20, 30, 40 and 50 trees.
param_grid = {'n_estimators': range(10, 60, 10)}

# Seed the forest so the search is repeatable, like the split and SMOTE-NC
# steps that already use random_state=42.
# NOTE(review): X_oversample presumably still holds the string-valued
# categorical columns; the forest needs numeric input, so confirm they are
# encoded before this cell.
# NOTE(review): cross-validating on data that is already oversampled lets
# synthetic rows appear in both the training and validation folds -- consider
# oversampling inside each fold instead.
grid = GridSearchCV(RandomForestClassifier(random_state=42), param_grid, cv=10)
grid.fit(X_oversample, y_oversample)

# Only a cell's last expression is displayed, so show both results together.
grid.best_params_, grid.best_score_