In [135]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.io as pio
import seaborn as sns
from plotly.subplots import make_subplots

from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.preprocessing import MinMaxScaler, StandardScaler, OneHotEncoder, QuantileTransformer, PowerTransformer, OrdinalEncoder
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import SelectFromModel


# import warnings
# warnings.filterwarnings('ignore')

**HELPERS**

In [71]:
IDS_TO_DROP = [
    "ID",
    "id_student"
]

TARGET = "final_result"

def cat_cols(data: pd.DataFrame):
    return data.select_dtypes(exclude="number").columns

def num_cols(data: pd.DataFrame):
    return data.select_dtypes(include="number").columns

def validate_model(model, X, y):
    y_hat = model.predict(X)
    print("f1_score: ", f1_score(y, y_hat))
    print("roc_auc_score: ", roc_auc_score(y, y_hat))
    print("accuracy_score: ", accuracy_score(y, y_hat))

### Загрузим данные

In [2]:
assessments = pd.read_csv("data/assessments.csv")
courses = pd.read_csv("data/courses.csv")
studentInfo = pd.read_csv("data/studentInfo.csv")
studentRegistration = pd.read_csv("data/studentRegistration.csv")
studentVle = pd.read_csv("data/studentVle.csv")
vle = pd.read_csv("data/vle.csv")
train_who = pd.read_csv("data/Train_Who.csv")
test_who = pd.read_csv("data/Test_Who.csv")
train_target_clf = pd.read_csv("data/Train_Target_clf.csv")

#### Склеим данные

In [3]:
# train step 1
students_train = pd.merge(train_who, studentInfo, how="inner", on=["id_student", "code_module", "code_presentation"])
students_train.info()

assert(students_train.shape[0] == train_who.shape[0])
assert(
    (students_train["id_student"] == train_who["id_student"]).value_counts().all(axis=0) == True
) # Ок смержилось хорошо

# Есть немного пропусокв в признае imd_band

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26074 entries, 0 to 26073
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   ID                    26074 non-null  int64 
 1   id_student            26074 non-null  int64 
 2   code_module           26074 non-null  object
 3   code_presentation     26074 non-null  object
 4   gender                26074 non-null  object
 5   region                26074 non-null  object
 6   highest_education     26074 non-null  object
 7   imd_band              25187 non-null  object
 8   age_band              26074 non-null  object
 9   num_of_prev_attempts  26074 non-null  int64 
 10  studied_credits       26074 non-null  int64 
 11  disability            26074 non-null  object
dtypes: int64(4), object(8)
memory usage: 2.4+ MB


In [4]:
# train step 2
students_train2 = pd.merge(
    students_train,
    studentRegistration,
    how="inner",
    on=["id_student", "code_module", "code_presentation"]
)
students_train2.info()
assert(students_train2.shape[0] == train_who.shape[0])
assert(
    (students_train2["id_student"] == train_who["id_student"]).value_counts().all(axis=0)
) # Добавилось немного пропусков в date_registration

# train step 3 - добавим целевую метку
students_train3 = pd.merge(
    students_train2,
    train_target_clf,
    how="inner",
    on="ID"
)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26074 entries, 0 to 26073
Data columns (total 13 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   ID                    26074 non-null  int64  
 1   id_student            26074 non-null  int64  
 2   code_module           26074 non-null  object 
 3   code_presentation     26074 non-null  object 
 4   gender                26074 non-null  object 
 5   region                26074 non-null  object 
 6   highest_education     26074 non-null  object 
 7   imd_band              25187 non-null  object 
 8   age_band              26074 non-null  object 
 9   num_of_prev_attempts  26074 non-null  int64  
 10  studied_credits       26074 non-null  int64  
 11  disability            26074 non-null  object 
 12  date_registration     26037 non-null  float64
dtypes: float64(1), int64(4), object(8)
memory usage: 2.6+ MB


In [5]:
# step 4 - общая часть для train/test
assessments_courses_info = pd.merge(
   assessments,
   courses,
   on=["code_module", "code_presentation"],
   how="inner"
)
assessments_courses_info = assessments_courses_info.rename({"date": "assessment_end_date"}, axis=1)
assert(assessments_courses_info.shape[0] == assessments.shape[0])

assessments_courses_info.info()
assessments_courses_info[assessments_courses_info["assessment_end_date"].isna()]
# это в основном экзамены. Потом с ними что то придумаем

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 206 entries, 0 to 205
Data columns (total 7 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   code_module                 206 non-null    object 
 1   code_presentation           206 non-null    object 
 2   id_assessment               206 non-null    int64  
 3   assessment_type             206 non-null    object 
 4   assessment_end_date         195 non-null    float64
 5   weight                      206 non-null    float64
 6   module_presentation_length  206 non-null    int64  
dtypes: float64(2), int64(2), object(3)
memory usage: 11.4+ KB


Unnamed: 0,code_module,code_presentation,id_assessment,assessment_type,assessment_end_date,weight,module_presentation_length
5,AAA,2013J,1757,Exam,,100.0,268
11,AAA,2014J,1763,Exam,,100.0,269
23,BBB,2013B,14990,Exam,,100.0,240
35,BBB,2013J,15002,Exam,,100.0,268
47,BBB,2014B,15014,Exam,,100.0,234
53,BBB,2014J,15025,Exam,,100.0,262
62,CCC,2014B,24290,Exam,,100.0,241
63,CCC,2014B,40087,Exam,,100.0,241
72,CCC,2014J,24299,Exam,,100.0,269
73,CCC,2014J,40088,Exam,,100.0,269


In [6]:
# step 5 - общая часть для train/test
print("studentVle count ", studentVle.shape[0])
for col in studentVle.columns:    
    print(f"nonna {col}: ", studentVle[col].count()) # Отлично пропусков нет

# Реализуем общую аггрегацию
grouping = studentVle.groupby(["id_student", "code_module", "code_presentation"], axis=0)
interaction_start = grouping["date"].min()
interaction_end = grouping["date"].max()
interaction_interval = (interaction_end - interaction_start + 1).reset_index()
interaction_end = interaction_end.reset_index().rename({"date": "date_interaction_end"}, axis=1)
interaction_interval = interaction_interval.rename({"date": "interaction_interval"}, axis=1)

sum_of_clicks = grouping["sum_click"].sum().reset_index()
sum_of_clicks = sum_of_clicks.rename({"sum_click": "sum_of_clicks"}, axis=1)

days_active = grouping.size().reset_index()
days_active = days_active.rename({0: "days_active"}, axis=1)

number_of_watched_sites = grouping["id_site"].nunique().reset_index()
number_of_watched_sites = number_of_watched_sites.rename({"id_site": "number_of_watched_sites"}, axis=1)

studentVle count  10655280
nonna code_module:  10655280
nonna code_presentation:  10655280
nonna id_student:  10655280
nonna id_site:  10655280
nonna date:  10655280
nonna sum_click:  10655280


#### Сгенерируем общую таблицу из аггрегации выше

In [7]:
# step 5 continuation
generated_features_p1 = pd.merge(
    interaction_interval,
    sum_of_clicks,
    on=["id_student", "code_module", "code_presentation"],
    how="inner"
)
assert(generated_features_p1.shape[0] == interaction_interval.shape[0] == sum_of_clicks.shape[0])
generated_features_p2 = pd.merge(
    generated_features_p1,
    interaction_end,
    on=["id_student", "code_module", "code_presentation"],
    how="inner"
)
assert(generated_features_p2.shape[0] == generated_features_p1.shape[0] == interaction_end.shape[0])
generated_features_p3 = pd.merge(
    days_active,
    number_of_watched_sites,
    on=["id_student", "code_module", "code_presentation"],
    how="inner"
)
assert(generated_features_p3.shape[0] == days_active.shape[0] == number_of_watched_sites.shape[0])

generated_features = pd.merge(
    generated_features_p2,
    generated_features_p3,
    on=["id_student", "code_module", "code_presentation"],
    how="inner"
)
assert(generated_features.shape[0] == generated_features_p2.shape[0] == generated_features_p3.shape[0])

generated_features.info() # все ок и даже пропусков нет

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29228 entries, 0 to 29227
Data columns (total 8 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   id_student               29228 non-null  int64 
 1   code_module              29228 non-null  object
 2   code_presentation        29228 non-null  object
 3   interaction_interval     29228 non-null  int64 
 4   sum_of_clicks            29228 non-null  int64 
 5   date_interaction_end     29228 non-null  int64 
 6   days_active              29228 non-null  int64 
 7   number_of_watched_sites  29228 non-null  int64 
dtypes: int64(6), object(2)
memory usage: 1.8+ MB


In [8]:
# VLE выглядит крайне сомнительно - попробуем нужна ли эта таблица - не будем ее мержить
vle.sample(5)

Unnamed: 0,id_site,code_module,code_presentation,activity_type,week_from,week_to
5960,882571,FFF,2014J,oucontent,26.0,26.0
3442,924980,DDD,2014J,resource,,
436,543196,BBB,2013B,resource,,
945,704019,BBB,2013J,resource,,
4168,527470,FFF,2013B,resource,,


In [9]:
students_train3["id_student"].nunique()

23579

In [27]:
# step 6 - объединим students_train + generated_features
students_train_part1 = pd.merge(
    students_train3,
    generated_features,
    on=["id_student", "code_module", "code_presentation"],
    how="left"
)

assert(students_train_part1["id_student"] == students_train3["id_student"]).value_counts().all(axis=0)

# Пока оставим инфо о assesments & courses

# students_train_part2 = pd.merge(
#     students_train3,
#     assessments_courses_info,
#     on=["code_module",	"code_presentation"],
#     how="inner"
# )
students_train_part1[students_train_part1["sum_of_clicks"].isna()] # не для всех студентов есть информация о взаимодействии с платформой(

Unnamed: 0,ID,id_student,code_module,code_presentation,gender,region,highest_education,imd_band,age_band,num_of_prev_attempts,studied_credits,disability,date_registration,final_result,interaction_interval,sum_of_clicks,date_interaction_end,days_active,number_of_watched_sites
15,15,2587950,BBB,2013J,F,North Region,HE Qualification,0-10%,0-35,4,240,N,-57.0,0,,,,,
27,27,601491,BBB,2013J,F,London Region,HE Qualification,30-40%,0-35,0,120,N,-58.0,0,,,,,
40,40,611581,FFF,2014B,M,South West Region,A Level or Equivalent,10-20,0-35,0,60,Y,-89.0,0,,,,,
45,45,650353,FFF,2014J,M,East Anglian Region,Lower Than A Level,80-90%,0-35,0,120,N,-149.0,0,,,,,
56,56,608606,FFF,2013J,M,West Midlands Region,Lower Than A Level,90-100%,35-55,0,60,N,-35.0,0,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26042,26042,2487785,BBB,2013J,F,Ireland,Lower Than A Level,80-90%,0-35,0,120,N,-141.0,0,,,,,
26043,26043,2625752,BBB,2013J,F,East Anglian Region,Lower Than A Level,20-30%,0-35,0,180,N,-51.0,0,,,,,
26052,26052,562267,BBB,2014B,M,North Western Region,Lower Than A Level,10-20,35-55,0,60,N,-24.0,0,,,,,
26054,26054,548321,BBB,2013B,F,South West Region,A Level or Equivalent,30-40%,0-35,0,60,N,-79.0,0,,,,,


#### Повторим для тренировочной выборки

In [24]:
# test step 1
students_test = pd.merge(test_who, studentInfo, how="inner", on=["id_student", "code_module", "code_presentation"])
students_test.info()

assert(students_test.shape[0] == test_who.shape[0])
assert(
    (students_test["id_student"] == test_who["id_student"]).value_counts().all(axis=0) == True
) # Ок смержилось хорошо
# Также есть немного пропусокв в признае imd_band

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6519 entries, 0 to 6518
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   ID                    6519 non-null   int64 
 1   id_student            6519 non-null   int64 
 2   code_module           6519 non-null   object
 3   code_presentation     6519 non-null   object
 4   gender                6519 non-null   object
 5   region                6519 non-null   object
 6   highest_education     6519 non-null   object
 7   imd_band              6295 non-null   object
 8   age_band              6519 non-null   object
 9   num_of_prev_attempts  6519 non-null   int64 
 10  studied_credits       6519 non-null   int64 
 11  disability            6519 non-null   object
dtypes: int64(4), object(8)
memory usage: 611.3+ KB


In [25]:
# test step 2
students_test2 = pd.merge(
    students_test,
    studentRegistration,
    how="inner",
    on=["id_student", "code_module", "code_presentation"]
)
students_test2.info()
assert(students_test2.shape[0] == test_who.shape[0])
assert(
    (students_test2["id_student"] == test_who["id_student"]).value_counts().all(axis=0)
) # Также добавилось немного пропусков в date_registration

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6519 entries, 0 to 6518
Data columns (total 13 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   ID                    6519 non-null   int64  
 1   id_student            6519 non-null   int64  
 2   code_module           6519 non-null   object 
 3   code_presentation     6519 non-null   object 
 4   gender                6519 non-null   object 
 5   region                6519 non-null   object 
 6   highest_education     6519 non-null   object 
 7   imd_band              6295 non-null   object 
 8   age_band              6519 non-null   object 
 9   num_of_prev_attempts  6519 non-null   int64  
 10  studied_credits       6519 non-null   int64  
 11  disability            6519 non-null   object 
 12  date_registration     6511 non-null   float64
dtypes: float64(1), int64(4), object(8)
memory usage: 662.2+ KB


In [29]:
# step 3 - объединим students_test + generated_features
students_test_part1 = pd.merge(
    students_test2,
    generated_features,
    on=["id_student", "code_module", "code_presentation"],
    how="left"
)

assert(students_test_part1["id_student"] == students_test2["id_student"]).value_counts().all(axis=0)

(6519, 18)

### Сформируем финальные выборки

### И проведем предобработку данных

In [53]:
train_ds = students_train_part1.copy()
test_ds = students_test_part1.copy()
BOTH_DATA_SETS = [train_ds, test_ds]

In [54]:
for ds in BOTH_DATA_SETS:
    for id in IDS_TO_DROP:
        ds[id] = ds[id].astype(object) # поменяем айдишники на категориальные признаки

##### Но перед этим создадим модель и посмотрим вообще что к чему

In [85]:
num_imputer = SimpleImputer(strategy="median")

cat_imputer = SimpleImputer(strategy="most_frequent")
one_hot_encoder = OneHotEncoder(drop="if_binary", handle_unknown="ignore", sparse=False)

_X_num = train_ds[num_cols(train_ds).drop(TARGET)]
_X_cat = train_ds[cat_cols(train_ds).drop(IDS_TO_DROP)]

lr = LogisticRegression(penalty="l2", fit_intercept=True, C=0.1, solver="newton-cholesky")

_X_num = num_imputer.fit_transform(_X_num)
_X_cat = one_hot_encoder.fit_transform(
    cat_imputer.fit_transform(_X_cat)
)
_X = np.concatenate((_X_num, _X_cat), axis=1)
lr.fit(_X, train_ds[TARGET].values)

validate_model(lr, _X, train_ds[TARGET].values)



f1_score:  0.8692704722286418
roc_auc_score:  0.8722836398306029
accuracy_score:  0.8698320165682288


In [134]:
feature_names = np.concatenate(
    (
        num_imputer.get_feature_names_out(num_cols(train_ds).drop(TARGET)),
        one_hot_encoder.get_feature_names_out(cat_cols(train_ds).drop(IDS_TO_DROP)),
    )
)
coefs = pd.Series(lr.coef_.ravel(), feature_names)
coefs.sort_values(ascending=True) 
# Видим что, то какой модуль сильно влияет на возможность сдать - это логично
# Судя по всему прямопропорцинально влияет imd_band, что тоже логично
# Уровень Образование также прямо пропрционнально влияет на вероятность сдать

# В совокупности с имеющимися фичами хочется заюзать какую нибудь стратегию заполнения пропусков основанную на близости
# Думаю это может хорошо помочь

code_module_FFF                                 -1.066118
code_module_CCC                                 -0.647785
code_module_DDD                                 -0.607563
highest_education_No Formal quals               -0.478474
imd_band_0-10%                                  -0.443804
highest_education_Lower Than A Level            -0.428073
code_presentation_2013J                         -0.287481
code_presentation_2014J                         -0.259953
imd_band_10-20                                  -0.240458
region_London Region                            -0.236501
region_North Western Region                     -0.214500
num_of_prev_attempts                            -0.177279
disability_Y                                    -0.127521
region_West Midlands Region                     -0.121060
age_band_35-55                                  -0.088611
region_East Anglian Region                      -0.086814
region_Yorkshire Region                         -0.072995
imd_band_20-30

## Submission

In [81]:
_X_num_test = test_ds[num_cols(test_ds)]
_X_cat_test = test_ds[cat_cols(test_ds).drop(IDS_TO_DROP)]

_X_num_test = num_imputer.fit_transform(_X_num_test)
_X_cat_test = one_hot_encoder.fit_transform(
    cat_imputer.fit_transform(_X_cat_test)
)
_X_test = np.concatenate((_X_num_test, _X_cat_test), axis=1)



In [82]:
predictions = lr.predict(_X_test)
submit = test_ds[["ID"]].copy()
submit[TARGET] = predictions
submit = submit.set_index("ID")
submit.to_csv("submission.csv")

In [147]:
num_imputer = KNNImputer(n_neighbors=25, weights="distance")

cat_imputer = SimpleImputer(strategy="most_frequent")
one_hot_encoder = OneHotEncoder(drop="if_binary", handle_unknown="ignore", sparse=False)

_X_num = train_ds[num_cols(train_ds).drop(TARGET)]
_X_cat = train_ds[cat_cols(train_ds).drop(IDS_TO_DROP)]

lr = LogisticRegression(penalty="l2", fit_intercept=True, C=0.1, solver="newton-cholesky")

_X_num = num_imputer.fit_transform(_X_num)
_X_cat = one_hot_encoder.fit_transform(
    cat_imputer.fit_transform(_X_cat)
)
_X = np.concatenate((_X_num, _X_cat), axis=1)
lr.fit(_X, train_ds[TARGET].values)

validate_model(lr, _X, train_ds[TARGET].values)



f1_score:  0.9120423670274004
roc_auc_score:  0.9148838132155174
accuracy_score:  0.9120963411827875


In [148]:
_X_num_test = test_ds[num_cols(test_ds)]
_X_cat_test = test_ds[cat_cols(test_ds).drop(IDS_TO_DROP)]

_X_num_test = num_imputer.fit_transform(_X_num_test)
_X_cat_test = one_hot_encoder.fit_transform(
    cat_imputer.fit_transform(_X_cat_test)
)
_X_test = np.concatenate((_X_num_test, _X_cat_test), axis=1)

predictions = lr.predict(_X_test)
submit = test_ds[["ID"]].copy()
submit[TARGET] = predictions
submit = submit.set_index("ID")
submit.to_csv("submission.csv")

