In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import confusion_matrix, recall_score, precision_score
import pickle

In [2]:
df = pd.read_csv("https://raw.githubusercontent.com/evgpat/stepik_from_idea_to_mvp/main/datasets/clients.csv")

In [3]:
df = df[df['satisfaction'].isin(['neutral or dissatisfied','satisfied'])]

In [4]:
df['satisfaction'].value_counts()

satisfaction
neutral or dissatisfied    58879
satisfied                  45025
Name: count, dtype: int64

In [5]:
features = ['Leg room service',
 'Baggage handling',
 'On-board service',
 'Age',
 'Checkin service',
 'Type of Travel',
 'satisfaction']

In [6]:
df = df[features]

In [7]:
df = df.dropna(subset=['Type of Travel'])

In [8]:
df['Type of Travel'] = df['Type of Travel'].map({'Business travel': 1, 'Personal Travel': 0})

In [9]:
X = df.drop(['satisfaction'], axis=1)
y = df['satisfaction']

In [10]:
y = y.map({'satisfied': 1, 'neutral or dissatisfied': 0})

In [11]:
df['Type of Travel'].unique()

array([1, 0], dtype=int64)

In [12]:
numeric_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

In [13]:
transformer = ColumnTransformer([
    ('numeric', numeric_pipeline, X.columns)
])

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=25)

In [15]:
transformer.fit(X_train)

X_train_tranformed = pd.DataFrame(transformer.transform(X_train), columns=X.columns)
X_test_tranformed = pd.DataFrame(transformer.transform(X_test), columns=X.columns)

In [16]:
model = RandomForestClassifier(n_estimators=50)

In [17]:
model.fit(X_train_tranformed, y_train)

In [18]:
y_pred = model.predict_proba(X_test_tranformed)

In [19]:
classes = (y_pred[:, 1] > 0.5).astype(int)

In [20]:
classes

array([1, 0, 1, ..., 1, 0, 0])

In [21]:
pd.Series(classes).value_counts()

0    15001
1    10953
Name: count, dtype: int64

In [22]:
with open('model.pickle', 'wb') as file:
    pickle.dump(model, file)

In [23]:
with open('transformer.pickle', 'wb') as f:
    pickle.dump(transformer, f)

In [24]:
data = {
        'Leg room service': 0,
        'Baggage handling': 10,
        'On-board service': 0,
        'Age': 10,
        'Checkin service': 10,
        'Type of Travel': 0
    }

df = pd.DataFrame(data, index=[0])

In [25]:
pd.DataFrame(model.predict_proba(df), )

Unnamed: 0,0,1
0,0.2,0.8


In [26]:
X_test['classes'] = classes

In [27]:
X_test

Unnamed: 0,Leg room service,Baggage handling,On-board service,Age,Checkin service,Type of Travel,classes
129341,5.0,3.0,3.0,57.0,5.0,0,1
97478,4.0,4.0,2.0,30.0,3.0,1,0
26931,1.0,2.0,1.0,29.0,2.0,1,1
106605,2.0,2.0,4.0,27.0,3.0,1,0
110042,2.0,2.0,1.0,50.0,1.0,1,0
...,...,...,...,...,...,...,...
26006,4.0,3.0,2.0,18.0,1.0,1,0
53355,4.0,2.0,2.0,27.0,2.0,1,0
1041,4.0,4.0,4.0,37.0,3.0,1,1
53170,3.0,3.0,5.0,36.0,2.0,1,0
