In [28]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

import pandas as pd

In [3]:
# Скачиваю датасет 
adult = pd.read_csv('adult.csv')
adult.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48842 entries, 0 to 48841
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             48842 non-null  int64 
 1   workclass       46043 non-null  object
 2   fnlwgt          48842 non-null  int64 
 3   education       48842 non-null  object
 4   education-num   48842 non-null  int64 
 5   marital-status  48842 non-null  object
 6   occupation      46033 non-null  object
 7   relationship    48842 non-null  object
 8   race            48842 non-null  object
 9   sex             48842 non-null  object
 10  capitalgain     48842 non-null  int64 
 11  capitalloss     48842 non-null  int64 
 12  hoursperweek    48842 non-null  int64 
 13  native-country  47985 non-null  object
 14  class           48842 non-null  object
dtypes: int64(6), object(9)
memory usage: 5.6+ MB


In [4]:
adult.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capitalgain,capitalloss,hoursperweek,native-country,class
0,2,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,1,0,2,United-States,<=50K
1,3,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,0,United-States,<=50K
2,2,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,2,United-States,<=50K
3,3,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,2,United-States,<=50K
4,1,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,2,Cuba,<=50K


In [5]:
# Проверяю, совпадают ли пропуски в столбцах
adult[pd.isnull(adult['occupation']) & pd.isnull(adult['workclass'])].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2799 entries, 27 to 48838
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             2799 non-null   int64 
 1   workclass       0 non-null      object
 2   fnlwgt          2799 non-null   int64 
 3   education       2799 non-null   object
 4   education-num   2799 non-null   int64 
 5   marital-status  2799 non-null   object
 6   occupation      0 non-null      object
 7   relationship    2799 non-null   object
 8   race            2799 non-null   object
 9   sex             2799 non-null   object
 10  capitalgain     2799 non-null   int64 
 11  capitalloss     2799 non-null   int64 
 12  hoursperweek    2799 non-null   int64 
 13  native-country  2753 non-null   object
 14  class           2799 non-null   object
dtypes: int64(6), object(9)
memory usage: 349.9+ KB


In [6]:
# Удаляю столбцы с пропусками
adult_drop = adult.dropna(subset = ['workclass', 'occupation'])

In [7]:
adult_drop.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 46033 entries, 0 to 48841
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             46033 non-null  int64 
 1   workclass       46033 non-null  object
 2   fnlwgt          46033 non-null  int64 
 3   education       46033 non-null  object
 4   education-num   46033 non-null  int64 
 5   marital-status  46033 non-null  object
 6   occupation      46033 non-null  object
 7   relationship    46033 non-null  object
 8   race            46033 non-null  object
 9   sex             46033 non-null  object
 10  capitalgain     46033 non-null  int64 
 11  capitalloss     46033 non-null  int64 
 12  hoursperweek    46033 non-null  int64 
 13  native-country  45222 non-null  object
 14  class           46033 non-null  object
dtypes: int64(6), object(9)
memory usage: 5.6+ MB


In [32]:
# В столбце native-country пропуски не совпадают с предыдущими двумя столбцами. Поэтому проверяю, как там распределяются значения.
adult_drop.groupby(adult['native-country']).count()

Unnamed: 0_level_0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capitalgain,capitalloss,hoursperweek,native-country,class
native-country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
Cambodia,26,26,26,26,26,26,26,26,26,26,26,26,26,26,26
Canada,163,163,163,163,163,163,163,163,163,163,163,163,163,163,163
China,113,113,113,113,113,113,113,113,113,113,113,113,113,113,113
Columbia,82,82,82,82,82,82,82,82,82,82,82,82,82,82,82
Cuba,133,133,133,133,133,133,133,133,133,133,133,133,133,133,133
Dominican-Republic,97,97,97,97,97,97,97,97,97,97,97,97,97,97,97
Ecuador,43,43,43,43,43,43,43,43,43,43,43,43,43,43,43
El-Salvador,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147
England,119,119,119,119,119,119,119,119,119,119,119,119,119,119,119
France,36,36,36,36,36,36,36,36,36,36,36,36,36,36,36


In [10]:
# Заполняю оставшиеся пропуски в столбце модой, потому что она очень сильно превышает другие значения.
adult_drop = adult_drop.fillna(adult_drop['native-country'].mode()[0])

In [11]:
adult_drop.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 46033 entries, 0 to 48841
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             46033 non-null  int64 
 1   workclass       46033 non-null  object
 2   fnlwgt          46033 non-null  int64 
 3   education       46033 non-null  object
 4   education-num   46033 non-null  int64 
 5   marital-status  46033 non-null  object
 6   occupation      46033 non-null  object
 7   relationship    46033 non-null  object
 8   race            46033 non-null  object
 9   sex             46033 non-null  object
 10  capitalgain     46033 non-null  int64 
 11  capitalloss     46033 non-null  int64 
 12  hoursperweek    46033 non-null  int64 
 13  native-country  46033 non-null  object
 14  class           46033 non-null  object
dtypes: int64(6), object(9)
memory usage: 5.6+ MB


In [13]:
# Получаю таргетные значения
le = LabelEncoder()

In [14]:
le.fit(adult_drop['class'])

LabelEncoder()

In [15]:
y_d = pd.Series(data=le.transform(adult_drop['class']))
y_d

0        0
1        0
2        0
3        0
4        0
        ..
46028    0
46029    0
46030    0
46031    0
46032    1
Length: 46033, dtype: int32

In [18]:
# Получаю таблицу с атрибутами. Это максимум, который тянет мой компьютер, дальше не стала развивать модель.
for_x = adult_drop[['age', 'workclass', 'education', 'occupation', 'marital-status', 'relationship', 'race', 'sex', 
                    'hoursperweek', 'native-country']]

In [19]:
X_d = pd.get_dummies(for_x, columns = ['age', 'workclass', 'education', 'occupation', 'marital-status', 'relationship',
                                       'race', 'sex', 'hoursperweek', 'native-country'] )
X_d.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 46033 entries, 0 to 48841
Columns: 108 entries, age_0 to native-country_Yugoslavia
dtypes: uint8(108)
memory usage: 5.1 MB


In [21]:
# Разделяю тестовую выборку на train и test
X_train, X_test, y_train, y_test = train_test_split(X_d, y_d, test_size=0.3, random_state=42)

In [22]:
# Строю модель логистической регрессии
model = make_pipeline(StandardScaler(), LogisticRegression(max_iter=1000))

In [23]:
# Обучаю модель
model.fit(X_train, y_train)
predictions = model.predict(X_test)

In [24]:
model.predict_proba(X_test)

array([[0.9777347 , 0.0222653 ],
       [0.94615585, 0.05384415],
       [0.93436768, 0.06563232],
       ...,
       [0.98225517, 0.01774483],
       [0.95802816, 0.04197184],
       [0.99374763, 0.00625237]])

In [25]:
# Результаты на обучающей выборке
model.score(X_train, y_train)

0.8371349657077243

In [26]:
# Результаты на тестовой выборке
model.score(X_test, y_test)

0.8341057204923968

In [29]:
# Строю модель по методу SVC
clf = make_pipeline(StandardScaler(), SVC(gamma='auto')) 
clf.fit(X_train, y_train)

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('svc', SVC(gamma='auto'))])

In [30]:
# Результаты на обучающей выборке
clf.score(X_train, y_train)

0.8472519628836546

In [31]:
# Результаты на тестовой выборке
clf.score(X_test, y_test)

0.8307023895727733

In [None]:
# В итоге результаты работы моделей примерно одинаковые.