In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
from scipy import stats
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively list every file under the Kaggle input directory, one full path per line.
for folder, _subfolders, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the dating-app profiles, then print the column names and the
# summary statistics that describe() produces.
profiles_csv = '../input/profiles-dating-app/profiles.csv'
data = pd.read_csv(profiles_csv)
print(data.columns)
print(data.describe())

Note: Only columns with few categories are really suited for KNNClassifier!

# Questions

Predictions:

- Can body type be predicted from the dating profile info?
- Can sexual orientation be predicted from the dating profile info?
- Can sex be predicted from the essays in a dating profile?

 # 1 Data Preparation

Handling missing values:

In [None]:
# Drop every row that has at least one missing value and renumber the
# remaining rows from zero; then report how many profiles are left.
data = data.dropna().reset_index(drop=True)
print(len(data))


Splitting feature cols from target col

In [None]:
# The body type is the prediction target; all remaining columns are
# candidate features.
target_col = data['body_type']
feature_cols = data.drop(columns='body_type')

Choosing suitable features (essays aren't useful for non-NLP applications, because they are neither numerical nor categorical):

In [None]:
# Report the number of distinct values per feature column and keep only
# the columns with fewer than 100 distinct values.
rel_cols = []
for col_name, n_distinct in feature_cols.nunique().items():
    print('Column: {}, different values: {}'.format(col_name, n_distinct))
    if n_distinct < 100:
        rel_cols.append(col_name)

print(rel_cols)

One hot encoding of categorical features results in the following dataframe (only the first five lines are shown):

In [None]:
# Restrict the frame to the selected columns and one-hot encode them.
features = feature_cols.loc[:, rel_cols]
features_dummies = pd.get_dummies(features)

# Last expression of the cell: display the first five encoded rows.
features_dummies.head()

Data is now cleaned and encoded and therefore ready for EDA and ML.

# 2 Data Exploration

## 2.1 Demographics

Starting off with overall age distribution, it looks like the majority (75%) of users are between 20 and 40 years old. 50 percent of the users are older than 30 years:

In [None]:
# Histogram of user ages (50 bins), followed by the summary statistics
# of the age column.
age = data['age']

age_fig, age_ax = plt.subplots()
age_ax.hist(age, bins=50)
age_ax.set_title('Distribution of Age')
age_ax.set_xlabel('Age')
age_ax.set_ylabel('Number of People')
plt.show()

print(age.describe())

Age distribution by sex shows that age is evenly distributed among the sexes. It also shows that there are more females in every age bracket than males.

In [None]:
import plotly.express as px

# Interactive age histogram with one colour per sex.
fig = px.histogram(data_frame=data, x="age", color="sex")
fig.show()

The height distribution by sex shows that most females tend to be smaller than most males. This visualisation could be improved by displaying the relative distribution of size among each sex instead of absolute numbers. This would visually correct for the fact that there are more females than males in the data set (as shown above).

In [None]:
# Interactive height histogram with one colour per sex (absolute counts).
fig = px.histogram(data_frame=data, x="height", color="sex")
fig.show()

## 2.2 Habits

The first analyzed habit is smoking. It seems like the vast majority of users don't smoke.

In [None]:
# Histogram of the 'smokes' column: number of profiles per smoking status.
smokes_fig, smokes_ax = plt.subplots()
smokes_ax.hist(data['smokes'])
smokes_ax.set_ylabel('Number of profiles')
smokes_ax.set_xlabel('smoking status')
plt.show()

Further questions that could lead to interesting insights:
    
* What are the most common sexual orientations among the users?
* What are the most common languages?
* What does the income distribution look like?
* How many users have kids and how many pets?
* Are there correlations among the features?
* Can a tool be built that allows you to determine the percentage of the population that fits your preferences?

# 3 Predictions

## 3.1 Body Type Prediction

In [None]:
# Feature matrix and target for the body-type model.
X, y = features_dummies, target_col

# Hold out a test set; random_state is fixed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)





In [None]:
from xgboost import XGBClassifier



# Train a default-parameter XGBoost classifier on the training split.
# NOTE(review): newer XGBoost releases may reject non-integer class labels;
# confirm the installed version accepts the labels in y_train as they are.
xgbcl = XGBClassifier()
xgbcl.fit(X=X_train, y=y_train)

# Last expression of the cell: score of the fitted model on the held-out split.
xgbcl.score(X=X_test, y=y_test)

The score is very low. Not sure if this can be sufficiently improved by grid search, feature selection/engineering and other measures.

Using cross validation:

In [None]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation of the classifier on the full feature matrix.
scores = cross_val_score(estimator=xgbcl, X=X, y=y, cv=5)
print("Cross validation scores: {}".format(scores))

Results are still very low. Accuracy seems quite consistent across the different folds.

What features influence this classification?

In [None]:
from xgboost import plot_importance

# Number of top-ranked features to show in the importance plot.
num_features = 20

print('The importance of the {} most important features:'.format(num_features))

# Bar chart of the split-count ('weight') importance of the fitted model,
# limited to the num_features highest-ranked features.
importance = plot_importance(
    xgbcl,
    ax=None,
    height=1,
    xlim=None,
    ylim=None,
    title='Feature importance',
    xlabel='F score',
    ylabel='Features',
    fmap='',
    importance_type='weight',
    max_num_features=num_features,
    grid=True,
    show_values=True,
)

Age is by far the most important feature for this prediction, followed by height and then income. What does this mean for the model and feature selection?

In [None]:
# Print the n largest feature-importance values of the fitted model,
# in ascending order (values only, without feature names).
n = 15

arr = xgbcl.feature_importances_

# Indices that order the importances from smallest to largest.
sorted_index_array = np.argsort(arr)
sorted_array = np.take(arr, sorted_index_array)

# The tail of the ascending array holds the n largest values.
rslt = sorted_array[-n:]

print("{} largest values:".format(n), rslt)

Can feature selection improve the performance of the model?

Conclusion: I think the data set doesn't contain enough information to reliably predict a person's body type.

## 3.2 Sexual Orientation Prediction

There are three categories of sexual orientation a user can choose from:

In [None]:
# Show the distinct values found in the orientation column.
print(data.loc[:, 'orientation'].unique())

Splitting feature cols from target col

In [None]:
# The orientation is now the prediction target; all remaining columns are
# candidate features.
target_col = data['orientation']
feature_cols = data.drop(columns='orientation')

Choosing suitable features (essays aren't useful for non-NLP applications, because they are neither numerical nor categorical):

In [None]:
# Report the number of distinct values per feature column and keep only
# the columns with fewer than 100 distinct values.
rel_cols = []
for col_name, n_distinct in feature_cols.nunique().items():
    print('Column: {}, different values: {}'.format(col_name, n_distinct))
    if n_distinct < 100:
        rel_cols.append(col_name)

print(rel_cols)

One hot encoding of categorical features results in the following dataframe (only the first five lines are shown):

In [None]:
# Restrict the frame to the selected columns and one-hot encode them.
features = feature_cols.loc[:, rel_cols]
features_dummies = pd.get_dummies(features)

# Last expression of the cell: display the first five encoded rows.
features_dummies.head()

Data is ready for prediction

In [None]:
# Feature matrix and target for the sexual-orientation model.
X = features_dummies
y = target_col

# Fresh, default-parameter classifier dedicated to the orientation task.
xgbcl2 = XGBClassifier()

# 5-fold cross-validation of the new classifier. The estimator passed here
# must be xgbcl2; the body-type model xgbcl belongs to section 3.1.
scores = cross_val_score(xgbcl2, X, y, cv=5)
print("Cross validation scores: {}".format(scores))