In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import export_graphviz
from IPython.display import Image
import graphviz
import shap
import matplotlib.pyplot as plt

In [2]:
df= pd.read_csv('speed_dating_data.csv')
df

Unnamed: 0,gender,age,income,goal,career,dec,attr,sinc,intel,fun,amb,shar,like,prob,met
0,0,21.0,69487.0,2.0,lawyer,1,6.0,9.0,7.0,7.0,6.0,5.0,7.0,6.0,2.0
1,0,21.0,69487.0,2.0,lawyer,1,7.0,8.0,7.0,8.0,5.0,6.0,7.0,5.0,1.0
2,0,21.0,69487.0,2.0,lawyer,1,5.0,8.0,9.0,8.0,5.0,7.0,7.0,,1.0
3,0,21.0,69487.0,2.0,lawyer,1,7.0,6.0,8.0,7.0,6.0,8.0,7.0,6.0,2.0
4,0,21.0,69487.0,2.0,lawyer,1,5.0,6.0,7.0,7.0,6.0,6.0,6.0,6.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8373,1,25.0,,1.0,assistant master of the universe (otherwise it...,0,3.0,5.0,5.0,5.0,,,2.0,5.0,0.0
8374,1,25.0,,1.0,assistant master of the universe (otherwise it...,0,4.0,6.0,8.0,4.0,4.0,,4.0,4.0,0.0
8375,1,25.0,,1.0,assistant master of the universe (otherwise it...,0,4.0,7.0,8.0,8.0,8.0,,6.0,5.0,0.0
8376,1,25.0,,1.0,assistant master of the universe (otherwise it...,0,4.0,6.0,5.0,4.0,,5.0,5.0,5.0,0.0


In [3]:
df.columns #target: dec, features: income, attr, sinc, intel, fun, amb, 

Index(['gender', 'age', 'income', 'goal', 'career', 'dec', 'attr', 'sinc',
       'intel', 'fun', 'amb', 'shar', 'like', 'prob', 'met'],
      dtype='object')

In [4]:
df['career'].unique()

array(['lawyer', 'law', 'Economist', 'lawyer/policy work', 'Law',
       'Journalist', 'Congresswoman, and comedian',
       'To create early childhood intervention programs',
       'Academia, Research, Banking, Life', 'Corporate Lawyer', 'Lawyer',
       'Corporate attorney', 'research/financial industry',
       'academics or journalism', 'Financial Services', 'ceo', 'CEO',
       'Undecided', 'Informatics', 'psychologist',
       'health/nutrition oriented social worker', 'Social Worker',
       'Social work with children', 'Speech Language Pathologist', nan,
       'Social Work Administration', 'Professor', "Clidren's TV",
       'Banking', 'Capital Markets', 'Biostatistics',
       'Organizational Change Consultant', 'tech professional',
       'Engineer', 'Academic', 'academia', 'banker / academia', 'banker',
       'Music production', 'Entrepreneur',
       'Intellectual Property Attorney', 'Medicine', 'consulting',
       'LAWYER', 'social worker', 'comedienne', 'attorney',
  

In [5]:
len(df['career'].unique())

368

In [6]:
df.shape

(8378, 15)

In [7]:
df.dtypes

gender      int64
age       float64
income    float64
goal      float64
career     object
dec         int64
attr      float64
sinc      float64
intel     float64
fun       float64
amb       float64
shar      float64
like      float64
prob      float64
met       float64
dtype: object

In [8]:
df['id'] = df.set_index(['gender', 'age', 'income', 'career']).index.factorize()[0]+1

In [9]:
# dropping cols that are partner-dependent or have too many missings
df = df.drop(columns=['income', 'prob', 'shar', 'met', 'career'])

In [10]:
average_scores = df.groupby('id')[['attr', 'sinc', 'intel', 'fun', 'amb', 'like']].mean().reset_index()
df = df.groupby('id').first()
df = df.drop(columns = ['attr', 'sinc', 'intel', 'fun', 'amb', 'like']).join(average_scores.set_index('id'), on = 'id')
df

Unnamed: 0_level_0,gender,age,goal,dec,attr,sinc,intel,fun,amb,like
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,0,21.0,2.0,1,5.700000,7.300000,7.300000,6.800000,6.300000,6.500000
2,0,24.0,1.0,0,6.400000,7.000000,7.700000,6.100000,6.500000,6.600000
3,0,25.0,6.0,0,8.100000,8.600000,9.400000,7.700000,8.800000,8.200000
4,0,23.0,1.0,0,6.400000,8.900000,8.600000,7.800000,7.800000,6.600000
5,0,21.0,2.0,0,6.300000,6.000000,7.000000,6.000000,5.600000,7.200000
...,...,...,...,...,...,...,...,...,...,...
535,1,30.0,1.0,1,4.476190,6.714286,5.809524,4.476190,3.904762,5.047619
536,1,28.0,1.0,0,4.590909,7.045455,6.909091,6.727273,6.863636,5.363636
537,1,30.0,2.0,0,5.636364,5.954545,5.772727,5.500000,5.045455,5.500000
538,1,27.0,1.0,1,6.350000,7.100000,7.400000,6.700000,6.700000,7.150000


In [11]:
df.isna().sum()

gender    0
age       4
goal      3
dec       0
attr      0
sinc      1
intel     1
fun       1
amb       1
like      0
dtype: int64

In [12]:
df= df.dropna()

In [13]:
target = df['dec']
df= df.drop(columns= ['dec'])
features = df.select_dtypes(include=['number'])
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.20, random_state=42)

In [14]:
df

Unnamed: 0_level_0,gender,age,goal,attr,sinc,intel,fun,amb,like
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,0,21.0,2.0,5.700000,7.300000,7.300000,6.800000,6.300000,6.500000
2,0,24.0,1.0,6.400000,7.000000,7.700000,6.100000,6.500000,6.600000
3,0,25.0,6.0,8.100000,8.600000,9.400000,7.700000,8.800000,8.200000
4,0,23.0,1.0,6.400000,8.900000,8.600000,7.800000,7.800000,6.600000
5,0,21.0,2.0,6.300000,6.000000,7.000000,6.000000,5.600000,7.200000
...,...,...,...,...,...,...,...,...,...
535,1,30.0,1.0,4.476190,6.714286,5.809524,4.476190,3.904762,5.047619
536,1,28.0,1.0,4.590909,7.045455,6.909091,6.727273,6.863636,5.363636
537,1,30.0,2.0,5.636364,5.954545,5.772727,5.500000,5.045455,5.500000
538,1,27.0,1.0,6.350000,7.100000,7.400000,6.700000,6.700000,7.150000


In [15]:
normalizer = MinMaxScaler()

In [16]:
X_train_norm = normalizer.fit_transform(X_train)
X_test_norm = normalizer.transform(X_test)

In [17]:
X_train_norm 

array([[1.        , 0.28571429, 0.2       , ..., 0.63561077, 0.68640351,
        0.69330855],
       [1.        , 0.47619048, 0.        , ..., 0.55279503, 0.54605263,
        0.45167286],
       [0.        , 0.28571429, 0.        , ..., 0.59006211, 0.52236842,
        0.65241636],
       ...,
       [1.        , 0.33333333, 0.4       , ..., 0.49245785, 0.49686717,
        0.38077536],
       [1.        , 0.52380952, 0.        , ..., 0.50807453, 0.39210526,
        0.52973978],
       [1.        , 0.19047619, 0.8       , ..., 0.52173913, 0.61687307,
        0.62515489]])

In [18]:
X_train_norm = pd.DataFrame(X_train_norm, columns = X_train.columns)
X_train_norm.head()

Unnamed: 0,gender,age,goal,attr,sinc,intel,fun,amb,like
0,1.0,0.285714,0.2,0.653704,0.607966,0.62069,0.635611,0.686404,0.693309
1,1.0,0.47619,0.0,0.338889,0.591195,0.531034,0.552795,0.546053,0.451673
2,0.0,0.285714,0.0,0.633333,0.640252,0.605517,0.590062,0.522368,0.652416
3,0.0,0.142857,0.2,0.592593,0.930818,0.856705,0.468599,0.921053,0.625155
4,1.0,0.190476,1.0,0.388889,0.631027,0.477395,0.407867,0.388889,0.33891


In [19]:
knn = KNeighborsClassifier(n_neighbors=5)

In [20]:
knn.fit(X_train_norm, y_train)

In [21]:
pred = knn.predict(X_test_norm)  
pred



array([1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0,
       0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0,
       0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1])

In [22]:
y_test.values

array([1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1,
       0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0,
       0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0])

In [23]:
knn.score(X_test_norm, y_test)



0.5607476635514018

In [24]:
"""explainer = shap.KernelExplainer(knn.predict_proba, X_train)
shap_values = explainer.shap_values(X_test_norm[:9])
# Save the standard summary plot
plt.figure()
shap.summary_plot(shap_values, X_test_norm[:9], show=False)  # Set show=False to prevent displaying inline
plt.savefig("shap_summary_plot.png", dpi=300, bbox_inches='tight')
plt.close()  # Close the plot to reset the figure
# Save the bar summary plot
plt.figure()
shap.summary_plot(shap_values, X_test_norm[:9], plot_type="bar", show=False)  # Bar plot type
plt.savefig("shap_summary_plot_bar.png", dpi=300, bbox_inches='tight')
plt.close()
# Save the customized summary plot with larger plot size
plt.figure()
shap.summary_plot(shap_values, X_test_norm[:9], plot_size=(10, 6), show=False)  # Set custom plot size
plt.savefig("shap_summary_plot_custom_size.png", dpi=300, bbox_inches='tight')
plt.close()"""


'explainer = shap.KernelExplainer(knn.predict_proba, X_train)\nshap_values = explainer.shap_values(X_test_norm[:9])\n# Save the standard summary plot\nplt.figure()\nshap.summary_plot(shap_values, X_test_norm[:9], show=False)  # Set show=False to prevent displaying inline\nplt.savefig("shap_summary_plot.png", dpi=300, bbox_inches=\'tight\')\nplt.close()  # Close the plot to reset the figure\n# Save the bar summary plot\nplt.figure()\nshap.summary_plot(shap_values, X_test_norm[:9], plot_type="bar", show=False)  # Bar plot type\nplt.savefig("shap_summary_plot_bar.png", dpi=300, bbox_inches=\'tight\')\nplt.close()\n# Save the customized summary plot with larger plot size\nplt.figure()\nshap.summary_plot(shap_values, X_test_norm[:9], plot_size=(10, 6), show=False)  # Set custom plot size\nplt.savefig("shap_summary_plot_custom_size.png", dpi=300, bbox_inches=\'tight\')\nplt.close()'

In [25]:
print(X_test.columns)

Index(['gender', 'age', 'goal', 'attr', 'sinc', 'intel', 'fun', 'amb', 'like'], dtype='object')


In [26]:
#for i, feature_name in enumerate(X_test.columns):
    #print(f"{feature_name}: {shap_values[i].mean()}")

In [27]:
# logistic regression

model = LogisticRegression()
model.fit(X_train_norm, y_train)


In [28]:
y_pred = model.predict(X_test_norm)



In [29]:
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

In [30]:
accuracy

0.6074766355140186

In [31]:
# random forest

rf= RandomForestClassifier(n_estimators=10, random_state=0, max_depth=3)
rf.fit(X_train_norm, y_train)

In [32]:
y_pred= rf.predict(X_test_norm)



In [33]:
accuracy= accuracy_score(y_test, y_pred)
accuracy

0.6448598130841121

In [42]:
X_test_norm.shape

(107, 9)

In [50]:
# reshape
np_array= np.array(X_test_norm)
reshaped_data = np_array.reshape(107, -1)
reshaped_data

array([[1.        , 0.66666667, 0.        , 0.40111111, 0.39811321,
        0.37034483, 0.40559006, 0.34144737, 0.45      ],
       [0.        , 0.19047619, 0.        , 0.43777778, 0.57106918,
        0.65103448, 0.42608696, 0.65263158, 0.59107807],
       [1.        , 0.52380952, 0.        , 0.62111111, 0.66792453,
        0.57517241, 0.56273292, 0.46447368, 0.62788104],
       [0.        , 0.57142857, 0.2       , 0.47619048, 0.44654088,
        0.42561576, 0.25820763, 0.28665414, 0.4070632 ],
       [0.        , 0.57142857, 0.8       , 0.35833333, 0.3341195 ,
        0.37413793, 0.30822981, 0.3125    , 0.36105948],
       [1.        , 0.33333333, 0.        , 0.46222222, 0.82012579,
        0.69655172, 0.43975155, 0.69605263, 0.7260223 ],
       [0.        , 0.38095238, 0.2       , 0.38888889, 0.37735849,
        0.33409962, 0.36991028, 0.3245614 , 0.45477076],
       [1.        , 0.28571429, 0.        , 0.6537037 , 0.60796646,
        0.51111111, 0.5521049 , 0.39692982, 0.54337051],


In [52]:


# Angenommene Daten
X_test_norm = np.random.rand(107, 9)  # Beispiel: 107 Zeilen und 9 Spalten

# Erstelle das Array
np_array = np.array(X_test_norm)

# Überprüfe die Form
print("Ursprüngliche Form:", np_array.shape)  # sollte (107, 9) oder ähnlich sein

# Versuche, das Array umzupolen
try:
    reshaped_data = np_array.reshape(107, -1)  # Die Anzahl der Spalten wird automatisch berechnet
    print("Neue Form:", reshaped_data.shape)  # sollte (107, 9) oder entsprechend sein
except ValueError as e:
    print("Fehler beim Umformen:", e)


Ursprüngliche Form: (107, 9)
Neue Form: (107, 9)


In [None]:
type(X_test_norm)

In [35]:
for i in range(3):
    tree = rf.estimators_[i]  # Wähle den i-ten Baum
    dot_data = export_graphviz(tree,
                               feature_names=X_test_norm.columns,
                               filled=True,
                               max_depth=2,  # Tiefe des Diagramms
                               impurity=False,
                               proportion=True)
    graph = graphviz.Source(dot_data)  # Erstelle das Graphviz-Objekt
    display(graph)  # Zeige

AttributeError: 'numpy.ndarray' object has no attribute 'columns'

In [None]:
print(type(X_test_norm))