# ĐỒ ÁN CUỐI KỲ SỐ 02: Dự đoán kết quả nhập môn lập trình từ dữ liệu nộp bài trên wecode.

In [2]:
import pandas as pd
import json
import ast
import numpy as np

from datetime import datetime
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split, GridSearchCV, KFold, StratifiedKFold, cross_val_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler, RobustScaler, QuantileTransformer, PowerTransformer
from sklearn.linear_model import LinearRegression,  Lasso, Ridge, ElasticNet
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.base import BaseEstimator, RegressorMixin
from catboost import CatBoostRegressor
from mapie.regression import MapieRegressor

## Khám phá dữ liệu

In [3]:
anno_file = '/kaggle/input/cs114-final-project/annonimized.csv'
anno_data = pd.read_csv(anno_file)

# Đổi tên cho dễ dùng
rename_columns = {
    "concat('it001',`assignment_id`)": 'assignment_id',
    "concat('it001',`problem_id`)": 'problem_id',
    "concat('it001', username)": 'username',
    "concat('it001',`language_id`)": 'language_id'
}

anno_data.rename(columns=rename_columns, inplace=True)

In [4]:
anno_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 295198 entries, 0 to 295197
Data columns (total 11 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   assignment_id  295198 non-null  object
 1   problem_id     295198 non-null  object
 2   username       295198 non-null  object
 3   is_final       295198 non-null  int64 
 4   status         295198 non-null  object
 5   pre_score      295198 non-null  int64 
 6   coefficient    295198 non-null  int64 
 7   language_id    295198 non-null  object
 8   created_at     295198 non-null  object
 9   updated_at     295198 non-null  object
 10  judgement      295198 non-null  object
dtypes: int64(3), object(8)
memory usage: 24.8+ MB


In [5]:
# Tách thông tin từ cột judgement ra

def extract_judgement(row):
    data_string = row['judgement']
    try:
        data_dict = json.loads(data_string)
        times = data_dict.get('times', [])
        mems = data_dict.get('mems', [])
        verdicts = data_dict.get('verdicts', [])
        return pd.Series([times, mems, verdicts])
    except (json.JSONDecodeError, AttributeError):
        return pd.Series([None, None, None])

anno_data[['times', 'mems', 'verdicts']] = anno_data.apply(extract_judgement, axis = 1)
anno_data.drop(['judgement'], axis = 1, inplace=True)
print(anno_data.isna().any())

assignment_id    False
problem_id       False
username         False
is_final         False
status           False
pre_score        False
coefficient      False
language_id      False
created_at       False
updated_at       False
times             True
mems              True
verdicts          True
dtype: bool


In [6]:
# Kiểm tra một số lỗi trong verdicts
anno_data['verdicts'] = anno_data['verdicts'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)
exploded_verdicts = anno_data['verdicts'].explode()
unique_verdicts = exploded_verdicts.unique()
print(unique_verdicts)
# Thông tin 'verdicts' sẽ khác nhau dựa trên 'status'
print(f"Unique status values: {anno_data['status'].unique()}")

['WRONG' nan
 "\n\nIn function 'int main()':\n17:2: error: 'a' was not declared in this scope\n"
 ...
 "\n\nIn function 'int main()':\n13:11: error: 'fasle' was not declared in this scope\n"
 "\n\nIn function 'int myStrlen(char*)':\n24:14: error: 'len' was not declared in this scope\n"
 "\n\n71:1: error: 'Viết' does not name a type\n"]
Unique status values: ['SCORE' 'Compilation Error' 'pending' 'Syntax Error']


## Feature Engineering

### Judgement Features

In [11]:
# Giảm số lỗi trong verdicts
def simplify_error_verdicts(row):
    if row['status'] == 'Compilation Error':
        row['verdicts'] = {'Compilation Error': -1}
    elif row['status'] == 'Syntax Error':
        row['verdicts'] = {'Syntax Error': -1}
    return row['verdicts']
anno_data['verdicts'] = anno_data.apply(simplify_error_verdicts, axis=1)


In [12]:
# Lấy số verdicts được chấm điểm nhưng có test case bị sai
def get_wrong(row):
    data = row['verdicts']
    return (json.dumps(data).replace('"', "").replace('{', "").replace('}', "").split(":")[0] == 'WRONG')
anno_data['WRONG'] = anno_data.apply(get_wrong, axis=1)

### Time-based Features

In [13]:
# Thêm năm vào ngày, vì tồn tại ngày 29 - 2 => Phải là năm nhuận
def add_year(date):
    month = int(date.split('-')[0])
    year = '2019' if month >= 8 else '2020' # Năm nhuận gần nhất
    return f'{year}-{date}'

anno_data['created_at'] = pd.to_datetime(anno_data['created_at'].apply(add_year))
anno_data['updated_at'] = pd.to_datetime(anno_data['updated_at'].apply(add_year))
# Chuyển `created_at` và `updated_at` thành dạng số
ref_date = pd.Timestamp('2019-01-01')
anno_data['time_to_solve'] = (anno_data['updated_at'] - anno_data['created_at']).dt.total_seconds()
anno_data['numeric_created_at'] = (anno_data['created_at'] - ref_date).dt.days
anno_data['numeric_updated_at'] = (anno_data['updated_at'] - ref_date).dt.days

In [14]:
anno_data['date'] = anno_data['created_at']
# Lấy các giá trị liên quan ngày tháng
anno_data['month'] = anno_data['date'].dt.month
anno_data['hour_of_day'] = anno_data['created_at'].dt.hour
anno_data['day_of_week'] = anno_data['created_at'].dt.dayofweek
anno_data['day_of_month'] = anno_data['created_at'].dt.day

# Chia các cột tháng ra
month_dummies = pd.get_dummies(anno_data['month'], prefix='month')
anno_data = pd.concat([anno_data, month_dummies], axis=1)
for column in month_dummies.columns:
    anno_data[column] = anno_data.groupby('username')[column].transform('sum')

In [15]:
# Thời giản giải mỗi problem
def time_to_solve(row):
    if row["status"] == "SCORE" and row["is_final"] == 1:
        return (row["updated_at"] - row["created_at"]).total_seconds()
    else:
        return 0
anno_data["time_to_solve_seconds"] = anno_data.apply(time_to_solve, axis=1)
time_to_solve_stats = anno_data.groupby("username")["time_to_solve_seconds"].agg(["mean", "std"])
time_to_solve_stats.rename(columns={"mean": "avg_time_to_solve", "std": "std_time_to_solve"}, inplace=True)
anno_data = anno_data.merge(time_to_solve_stats, on="username", how="left")

### Đặc trưng mỗi sinh viên

- `num_assignments`: Số assignments mỗi sinh viên đã làm
- `num_problems`: Số problem của mỗi sinh viên đã làm
- `num_submissons`: Số lần submit của mỗi sinh viên
- `num_is_final`: Số bài nộp là final của mỗi sinh viên
- `num_status_SCORE`: Số bài nộp được chấm điểm của mỗi sinh viên
- `num_status_COMPILATION_ERROR`: Số bài nộp có lỗi compilation của mỗi sinh viên
- `num_status_PENDING`: Số bài nộp pending của mỗi sinh viên
- `num_status_SYNTAX_ERROR`: Số bài nộp có lỗi syntax của mỗi sinh viên
- `num_WRONG`: Số bài nộp có test case sai của mỗi sinh viên
- `num_late_submissions`: Số bài nộp trễ của mỗi sinh viên
- `mean_pre_score`: Trung bình `pre_score` của mỗi sinh viên
- `mean_coefficent`: Trung bình `coefficient` của mỗi sinh viên
- `std_pre_score`: Độ lệch chuẩn `pre_score` của mỗi sinh viên
- `std_coefficient`: Độ lệch chuẩn `coefficient` của mỗi sinh viên
- `mean_num_created_at`: Trung bình ngày nộp bài của mỗi sinh viên (Không cần?)
- `mean_num_updated_at`: Trung bình ngày chấm bài của mỗi sinh viên (Không cần?)
- `mean_submit_hour`: Giờ trong ngày trung bình của mỗi sinh viên
- `mean_submit_day`: Ngày trong tuần nộp bài trung bình của mỗi sinh viên

In [16]:

features = anno_data.groupby('username').agg(
    num_assignments = ('assignment_id', 'nunique'),
    num_problems = ('problem_id', 'nunique'),
    num_submissions = ('username', 'count'),
    num_is_final=('is_final', 'sum'),
    num_status_SCORE=('status', lambda x: (x == 'SCORE').sum()),
    num_status_COMPILATION_ERROR=('status', lambda x: (x == 'Compilation Error').sum()),
    num_status_PENDING=('status', lambda x: (x == 'pending').sum()),
    num_status_SYNTAX_ERROR=('status', lambda x: (x == 'Syntax Error').sum()),
    num_WRONG=('WRONG', 'sum'),
    num_late_submissions=('coefficient', lambda x: (x < 100).sum()),
    mean_pre_score=('pre_score', 'mean'),
    mean_coefficient=('coefficient', 'mean'),
    std_pre_score=('pre_score', 'std'),
    std_coefficient=('coefficient', 'std'),
    mean_num_created_at=('numeric_created_at', 'mean'),
    mean_num_updated_at=('numeric_updated_at', 'mean'),
    mean_submit_hour=('hour_of_day', 'mean'),
    mean_submit_day=('day_of_week', 'mean'),
).reset_index()


In [17]:
num_submits_per_assignment = anno_data.groupby(["username", "assignment_id"])["problem_id"].count().reset_index(name="num_submits_per_assignment")
num_submits_per_assignment = num_submits_per_assignment.groupby("username")["num_submits_per_assignment"].agg(["mean", "max", "min", "std"]).add_prefix("num_submits_per_assignment_")
num_submits_per_problem = anno_data.groupby(["username", "problem_id"])["assignment_id"].count().reset_index(name="num_submits_per_problem")
num_submits_per_problem = num_submits_per_problem.groupby("username")["num_submits_per_problem"].agg(["mean", "max", "min", "std"]).add_prefix("num_submits_per_problem_")

features = features.merge(num_submits_per_assignment, on="username", how="left")
features = features.merge(num_submits_per_problem, on="username", how="left")

In [18]:
# Số lần đạt điểm tối đa (`pre_score == 10000`)
full_score_count = anno_data[(anno_data['is_final']) & (anno_data['pre_score'] == 10000)]['username'].value_counts().reset_index()
full_score_count.columns = ['username', 'num_full_score']

total_scores = (
    anno_data[
        (anno_data['is_final']) & (anno_data['pre_score'] == 10000)
    ]
    .groupby(['username', 'problem_id'])
    .agg(
        pre_score=('pre_score', 'first'),
        coefficient=('coefficient', 'first')
    )
    .reset_index()
    .assign(
        score=lambda x: (x['pre_score'] / 100) * (x['coefficient'] / 100)
    )
    .groupby('username')['score'].sum().reset_index(name='total_score')
)

features = features.merge(
    full_score_count, on='username', how='left'
)

features = features.merge(total_scores, on='username', how='left')

features['num_full_score'] = features['num_full_score'].fillna(0).astype(int)
features['total_score'] = features['total_score'].fillna(0).astype(int)

In [19]:
bins = [0, 3, 6, 9, 12, 15, 18, 21, 24]
labels = ['0-2', '3-5', '6-8', '9-11', '12-14', '15-17', '18-20', '21-23']
anno_data['time_slot'] = pd.cut(anno_data['hour_of_day'], bins=bins, labels=labels, right=True)
for label in labels:
    anno_data[label] = (anno_data['time_slot'] == label).astype(int)
    anno_data[label] = anno_data.groupby('username')[label].transform('sum')

anno_data = anno_data.drop(columns=['time_slot', 'date'])
anno_data

Unnamed: 0,assignment_id,problem_id,username,is_final,status,pre_score,coefficient,language_id,created_at,updated_at,...,avg_time_to_solve,std_time_to_solve,0-2,3-5,6-8,9-11,12-14,15-17,18-20,21-23
0,90ce27571176d87961b565d5ef4b3de33ede04ac,789454427dd4097a14749e3dde63346b7a8d3811,ed9eaeb6a707f50154024b24d7efcb874a9795dd,0,SCORE,0,100,it0012,2019-10-09 08:02:04,2019-10-09 08:06:58,...,3.784173,4.873416,8,20,47,26,17,15,0,0
1,90ce27571176d87961b565d5ef4b3de33ede04ac,789454427dd4097a14749e3dde63346b7a8d3811,ed9eaeb6a707f50154024b24d7efcb874a9795dd,0,SCORE,0,100,it0012,2019-10-09 08:04:41,2019-10-09 08:04:51,...,3.784173,4.873416,8,20,47,26,17,15,0,0
2,90ce27571176d87961b565d5ef4b3de33ede04ac,789454427dd4097a14749e3dde63346b7a8d3811,ed9eaeb6a707f50154024b24d7efcb874a9795dd,1,SCORE,10000,100,it0012,2019-10-09 08:06:49,2019-10-09 08:06:58,...,3.784173,4.873416,8,20,47,26,17,15,0,0
3,90ce27571176d87961b565d5ef4b3de33ede04ac,bf96fbdc5f499538c3e2bfbec5779c8a14b0a9ff,ed9eaeb6a707f50154024b24d7efcb874a9795dd,1,SCORE,10000,100,it0012,2019-10-09 08:47:52,2019-10-09 08:48:01,...,3.784173,4.873416,8,20,47,26,17,15,0,0
4,90ce27571176d87961b565d5ef4b3de33ede04ac,7a6e5ca470ff47c3b5048f240c4738de71010c78,ed9eaeb6a707f50154024b24d7efcb874a9795dd,1,SCORE,10000,100,it0012,2019-10-09 09:19:35,2019-10-09 09:19:45,...,3.784173,4.873416,8,20,47,26,17,15,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
295193,613aea04c978f5e72fffc8bcff1f7b695a63f7b1,388516cbf597351226be1bdbe5ef30b9dcef570f,232cce96362898f08e9150ba244adaf2d6583ab2,1,SCORE,10000,100,it0012,2020-01-15 16:03:43,2020-01-15 16:03:53,...,3.820175,5.165809,17,14,49,28,73,34,12,1
295194,613aea04c978f5e72fffc8bcff1f7b695a63f7b1,d2b96124ccb8e27b4b8dacdb935e729cb1ba546b,232cce96362898f08e9150ba244adaf2d6583ab2,0,Compilation Error,0,100,it0012,2020-01-15 16:04:07,2020-01-15 16:05:08,...,3.820175,5.165809,17,14,49,28,73,34,12,1
295195,613aea04c978f5e72fffc8bcff1f7b695a63f7b1,d2b96124ccb8e27b4b8dacdb935e729cb1ba546b,232cce96362898f08e9150ba244adaf2d6583ab2,1,SCORE,10000,100,it0012,2020-01-15 16:04:58,2020-01-15 16:05:08,...,3.820175,5.165809,17,14,49,28,73,34,12,1
295196,613aea04c978f5e72fffc8bcff1f7b695a63f7b1,8c0f8dd4ff55e1609f733e043ac5e88b1dde6e7c,232cce96362898f08e9150ba244adaf2d6583ab2,1,SCORE,10000,100,it0012,2020-01-15 16:05:13,2020-01-15 16:05:22,...,3.820175,5.165809,17,14,49,28,73,34,12,1


### Đặc trưng Assignment và Problem

In [20]:
assignments_per_problem = anno_data.groupby('problem_id')['assignment_id'].unique()
# Tính số lượng sinh viên đã tham gia mỗi assignment_id
students_per_assignment = anno_data.groupby('assignment_id')['username'].nunique()
# Tính số lượng sinh viên đã tham gia mỗi problem_id thông qua các assignment_id
students_per_problem = assignments_per_problem.apply(lambda x: students_per_assignment.loc[x].sum())
# Tính số lượng sinh viên đã giải được mỗi problem_id
final_solutions = anno_data[(anno_data['is_final'] == 1) & (anno_data['pre_score'] == 10000)]
solved_counts = final_solutions.groupby('problem_id')['username'].nunique()

assignment_features = pd.DataFrame({
    'problem_id': students_per_problem.index,
    'participated': students_per_problem.values,
    'solved': solved_counts.reindex(students_per_problem.index, fill_value=0).values,
})


### Ratio Features

- `ratio_final_submissions`: Tỉ lệ bài nộp là final trên số lần nộp
- `ratio_error_submissions`: Tỉ lệ bài nộp là lỗi trên số lần nộp
- `ratio_late_submissions`: Tỉ lệ bài nộp trễ trên trên số lần nộp
- `ratio_pending_submissions`: Tỉ lệ bài nộp có tình trạng pending trên số lần nộp
- `ratio_scores_submissions`: Tỉ lệ bài nộp được chấm điểm trên số lần nộp
- `ratio_final_scores`: Tỉ lệ bài nộp là final trên số lần được chấm điểm
- `ratio_full_scores`: Tỉ lệ bài nộp được full điểm trên số problems
- `ratio_submissions_problems`: Tỉ lệ nộp bài trên số problems
- `ratio_assignment_done`: Tỉ lệ bài nộp được chấm điểm trên số assignment
- `ratio_problems_done`: Tỉ lệ problems đã làm trên tổng số problems
- `ratio_wrong_problems`: Tỉ lệ bài nộp bị wrong trên tổng số problems
- `ratio_wrong_assignments`: Tỉ lệ bài nộp bị wrong trên tổng số assignments
- `ratio_wrong_scores`: Tỉ lệ bài nộp bị wrong trên số lần được chấm điểm

In [21]:
features['ratio_final_submissions'] = features["num_is_final"] / (features["num_submissions"] + 10**-9)
features['ratio_error_submissions'] = (features["num_status_COMPILATION_ERROR"] + features["num_status_SYNTAX_ERROR"]) / (features["num_submissions"] + 10**-9)
features['ratio_late_submissions'] = features["num_late_submissions"] / (features['num_submissions'] + 10**-9)
features['ratio_pending_submissions'] = features["num_status_PENDING"]/ (features["num_submissions"] + 10**-9)
features['ratio_scores_submissions'] = features["num_status_SCORE"] / (features["num_submissions"] + 10**-9)
features['ratio_final_scores'] = features["num_is_final"] / (features["num_status_SCORE"] + 10**-9)
features['ratio_full_scores'] = features["num_full_score"] / (features["num_problems"] + 10**-9)
features['ratio_scores_submissions'] = features["num_status_SCORE"] / (features["num_submissions"] + 10**-9)
features["ratio_submissions_assignments"] = features["num_submissions"] / (features["num_assignments"] + 10**-9)
features["ratio_submissions_problems"] = features["num_submissions"] / (features["num_problems"] + 10**-9)
features["ratio_scores_assignments"] = features["num_status_SCORE"] / (features["num_assignments"] + 10**-9)
features["ratio_scores_problems"] = features["num_status_SCORE"] / (features["num_problems"] + 10**-9)
features['ratio_assignment_done'] = features["num_assignments"] / (features["num_assignments"].max() + 10**-9)
features['ratio_problems_done'] = features["num_problems"] / (features["num_problems"].max() + 10**-9)
features['ratio_wrong_problems'] = features['num_WRONG'] / (features["num_problems"] + 10**-9)
features['ratio_wrong_assignments'] = features['num_WRONG'] / (features["num_assignments"] + 10**-9)
features['ratio_wrong_scores'] = features['num_WRONG'] / (features["num_status_SCORE"] + 10**-9)


In [22]:
anno_data = anno_data.merge(features, on='username', how='left')

## Đặc trưng theo độ khó
Nếu tỉ lệ làm được bài của tất cả sinh viên :
- `>= 0.7` thì bài tập đó được xếp vào mức dễ: `easy`
- `>= 0.4` thì bài tập đó được xếp vào mức trung bình: `medium`
- `>= 0.2` thì bài tập đó được xếp vào mức khó: `hard`
- `>= 0.1` thì bài tập đó được xếp vào mức siêu khó: `super_hard`
- Còn lại thì bài tập đó được xếp vào mức hủy diệt khó: `deadly_hard`

In [23]:
def difficult_ranking(ratio):
    if ratio >= 0.7:
        return 'easy'
    elif ratio >= 0.4:
        return 'medium'
    elif ratio >= 0.2:
        return 'hard'
    elif ratio >= 0.1:
        return 'super_hard'
    else:
        return 'deadly_hard'
assignment_features["ratio_solved_participated"] = assignment_features["solved"] / assignment_features["participated"]
assignment_features['difficulty'] = assignment_features['ratio_solved_participated'].apply(difficult_ranking)

In [24]:
anno_data = anno_data.merge(assignment_features[['problem_id', 'difficulty']], on='problem_id', how='left')
ac_submissions = anno_data[(anno_data['is_final'] == 1) & (anno_data['pre_score'] == 10000)]
diff_counts = ac_submissions.groupby(['username', 'difficulty']).size().unstack(fill_value=0).reset_index()
anno_data = anno_data.merge(diff_counts, on='username', how='left')
cols = ['easy', 'medium', 'hard', 'super_hard', 'deadly_hard',]
for col in cols:
    anno_data[col] = anno_data[col].fillna(0).astype(int)
anno_data.drop(columns='difficulty', inplace=True)

### One-hot-encoding
One-hot encoding cho các cột như `problem_id`, `assignment_id`. 
Việc One-hot encoding cho cột `language_id` cho kết quả tệ hơn

In [25]:
problem_encoder = OneHotEncoder(sparse_output=False)
anno_data['problem_id'] = anno_data['problem_id'].str.strip()
# Thực hiện fit_transform trên cột 'problem_id'
problem_encoded_columns = problem_encoder.fit_transform(anno_data[['problem_id']])
# Lấy tên các cột sau khi encoding và thêm tiền tố 'problem__'
problem_encoded_column_names = ['problem__' + cat for cat in problem_encoder.categories_[0]]
# Tạo DataFrame cho các cột đã được One-Hot problem_encoder
problem_encoder_df = pd.DataFrame(problem_encoded_columns, columns=problem_encoded_column_names, dtype=int)
# Kết hợp DataFrame ban đầu với các cột mới
anno_data = pd.concat([anno_data, problem_encoder_df], axis=1)

## Không quan trọng, thậm chí còn làm mất điểm.
# language_encoder = OneHotEncoder(sparse_output=False)
# anno_data['language_id'] = anno_data['language_id'].str.strip()
# # Thực hiện fit_transform trên cột 'assignment_id'
# language_encoded_columns = language_encoder.fit_transform(anno_data[['language_id']])
# # Lấy tên các cột sau khi encoding và thêm tiền tố 'in__'
# language_encoded_column_names = ['language__' + cat for cat in language_encoder.categories_[0]]
# # Tạo DataFrame cho các cột đã được One-Hot Encoded
# language_encoder_df = pd.DataFrame(language_encoded_columns, columns=language_encoded_column_names, dtype=int)
# # Kết hợp DataFrame ban đầu với các cột mới
# anno_data = pd.concat([anno_data, language_encoder_df], axis=1)

assignment_encoder = OneHotEncoder(sparse_output=False)
anno_data['assignment_id'] = anno_data['assignment_id'].str.strip()
# Thực hiện fit_transform trên cột 'assignment_id'
assignment_encoded_columns = assignment_encoder.fit_transform(anno_data[['assignment_id']])
# Lấy tên các cột sau khi encoding và thêm tiền tố 'assignment_id__'
assignment_encoded_column_names = ['assignment__' + cat for cat in assignment_encoder.categories_[0]]
# Tạo DataFrame cho các cột đã được One-Hot Encoded
assignment_encoder_df = pd.DataFrame(assignment_encoded_columns, columns=assignment_encoded_column_names, dtype=int)
# Kết hợp DataFrame ban đầu với các cột mới
anno_data = pd.concat([anno_data, assignment_encoder_df], axis=1)

## Dự đoán điểm

In [26]:
tbtl_file = '/kaggle/input/cs114-final-project/tbtl-public.csv'
qt_file = '/kaggle/input/cs114-final-project/qt-public.csv'
th_file = '/kaggle/input/cs114-final-project/th-public.csv'
ck_file = '/kaggle/input/cs114-final-project/ck-public.csv'

tbtl_data = pd.read_csv(tbtl_file)
qt_data = pd.read_csv(qt_file)
th_data = pd.read_csv(th_file)
ck_data = pd.read_csv(ck_file)
# Chuyển cột điểm sang dạng số
th_data.iloc[:, 1] = pd.to_numeric(th_data.iloc[:, 1], errors='coerce')
tbtl_data.iloc[:, 1] = pd.to_numeric(tbtl_data.iloc[:, 1], errors='coerce')
ck_data.iloc[:, 1] = pd.to_numeric(ck_data.iloc[:, 1], errors='coerce')
qt_data.iloc[:, 1] = pd.to_numeric(qt_data.iloc[:, 1], errors='coerce')

In [27]:
print(f"TBTL NaN values: {tbtl_data['TBTL'].isna().sum()}")
print(f"QT NaN values: {qt_data['diemqt'].isna().sum()}")
print(f"TH NaN values: {th_data['TH'].isna().sum()}")
print(f"CK NaN values: {ck_data['CK'].isna().sum()}")

TBTL NaN values: 0
QT NaN values: 7
TH NaN values: 8
CK NaN values: 6


In [28]:
qt_data['diemqt'] = qt_data['diemqt'].fillna(0)
th_data['TH'] = th_data['TH'].fillna(0)
ck_data['CK'] = ck_data['CK'].fillna(0)

In [29]:
tbtl_data.rename(columns={"hash": "username"}, inplace=True)
qt_data.rename(columns={"hash": "username"}, inplace=True)
th_data.rename(columns={"hash": "username"}, inplace=True)
ck_data.rename(columns={"hash": "username"}, inplace=True)

In [30]:
def create_train_predict_df(df, anno_df):
    train_df = df.copy()
    username = train_df['username']
    anno_username = anno_df['username']
    # Tìm các giá trị "username" có trong annonimized_df nhưng không có trong df_th - cần predict
    unique_username = anno_username[~anno_username.isin(username)]
    # Lọc các hàng trong annonimized_df mà có "username" nằm trong unique_username
    fil_df = anno_df[anno_df['username'].isin(unique_username)]
    unique_values = fil_df['username'].unique()
    predict_df = pd.DataFrame(unique_values, columns=['username'])
    predict_col = df.columns[1]
    train_df.rename(columns=rename_columns, inplace=True)
    ac_summary = anno_df.groupby('username').first().reset_index()
    train_df = pd.merge(train_df, ac_summary, on='username', how='left')
    predict_df = pd.merge(predict_df, ac_summary, on='username', how='left')
    drop_col = ['assignment_id', 'problem_id', 'is_final', 'status', 'pre_score',
                'coefficient', 'language_id', 'created_at', 'updated_at',
                'times', 'mems', 'verdicts', 'WRONG']
    train_df.drop(columns=drop_col, inplace=True)
    predict_df.drop(columns=drop_col, inplace=True)
    train_df.dropna(subset=predict_col, inplace=True)
    return train_df, predict_df

train_df_TL, predict_df_TL = create_train_predict_df(tbtl_data, anno_data)
train_df_QT, predict_df_QT = create_train_predict_df(qt_data, anno_data)
train_df_TH, predict_df_TH = create_train_predict_df(th_data, anno_data)
train_df_CK, predict_df_CK = create_train_predict_df(ck_data, anno_data)




In [31]:
# Lọc cột có giá trị NaN
columns_with_nan = train_df_TL.columns[train_df_TL.isna().any()].tolist()
print(columns_with_nan)

['std_time_to_solve', 'std_pre_score', 'std_coefficient', 'num_submits_per_assignment_std', 'num_submits_per_problem_std']


In [32]:
train_df_TL[columns_with_nan] = train_df_TL[columns_with_nan].fillna(0)
train_df_QT[columns_with_nan] = train_df_QT[columns_with_nan].fillna(0)
train_df_TH[columns_with_nan] = train_df_TH[columns_with_nan].fillna(0)
train_df_CK[columns_with_nan] = train_df_CK[columns_with_nan].fillna(0)

predict_df_TL[columns_with_nan] = predict_df_TL[columns_with_nan].fillna(0)
predict_df_QT[columns_with_nan] = predict_df_QT[columns_with_nan].fillna(0)
predict_df_TH[columns_with_nan] = predict_df_TH[columns_with_nan].fillna(0)
predict_df_CK[columns_with_nan] = predict_df_CK[columns_with_nan].fillna(0)


In [33]:
base_columns_to_drop = [
    'std_time_to_solve', 'avg_time_to_solve',
    'ratio_wrong_problems', 'ratio_wrong_assignments', 'ratio_wrong_scores',
    'ratio_pending_submissions', 'num_submits_per_assignment_min',
    'ratio_error_submissions'
]

columns_to_drop_TL = base_columns_to_drop + ['num_WRONG']
columns_to_drop_QT = base_columns_to_drop + [
    'num_status_SYNTAX_ERROR',
    'num_status_COMPILATION_ERROR',
    'num_status_PENDING',
    'num_submits_per_assignment_std',
    'mean_submit_day', 'mean_submit_hour',
    'mean_num_updated_at', 'mean_num_created_at',
    'num_WRONG', 
    'num_submits_per_assignment_max',
    'num_submits_per_problem_std', 
    'numeric_updated_at'
]
columns_to_drop_TH = base_columns_to_drop + [
    'num_status_SYNTAX_ERROR',
    'num_status_COMPILATION_ERROR',
    'num_status_PENDING',
    'num_submits_per_problem_std', 'num_submits_per_problem_min',
    'num_submits_per_problem_max', 'num_submits_per_problem_mean',
    'num_submits_per_assignment_std',
    'num_submits_per_assignment_max', 'num_submits_per_assignment_mean',
    'hour_of_day', 'day_of_week', 'day_of_month',
    'num_assignments'
]

columns_to_drop_CK = base_columns_to_drop + []  
train_df_TL = train_df_TL.drop(columns=columns_to_drop_TL)
train_df_QT = train_df_QT.drop(columns=columns_to_drop_QT)
train_df_TH = train_df_TH.drop(columns=columns_to_drop_TH)
train_df_CK = train_df_CK.drop(columns=columns_to_drop_CK)

predict_df_TL = predict_df_TL.drop(columns=columns_to_drop_TL)
predict_df_QT = predict_df_QT.drop(columns=columns_to_drop_QT)
predict_df_TH = predict_df_TH.drop(columns=columns_to_drop_TH)
predict_df_CK = predict_df_CK.drop(columns=columns_to_drop_CK)

In [34]:
models = [
    ('ridge', Ridge(), {
        'regressor__alpha': [2, 3, 4, 5, 6, 10]
    }),
    ('elastic_net', ElasticNet(max_iter=500000), {
        'regressor__alpha': [0.01, 0.1, 0.3],
        'regressor__l1_ratio': [0.2, 0.4, 0.6, 0.8]
    }),
    ('svr', SVR(), {
        'regressor__C': [2.0, 4.0, 5.0, 6.0 ,7.0],
        'regressor__kernel': ['rbf'],
        'regressor__epsilon': [0.2, 0.4, 0.6, 0.8, 1]
    }),
    ('random_forest', RandomForestRegressor(random_state=42), {
        'regressor__n_estimators': [50, 70, 100],
        'regressor__max_depth': [None, 3, 5, 7, 10]
    }),
    #7
    ('knn_regressor', KNeighborsRegressor(), {
        'regressor__n_neighbors': [10, 15, 20],
        'regressor__weights': ['distance']
    }),
    ('catboost', CatBoostRegressor(random_state=42, verbose=0, task_type="GPU", devices='0'), {
        'regressor__iterations': [400, 500],
        'regressor__learning_rate': [0.03, 0.05],
        'regressor__depth': [8, 10],
        'regressor__l2_leaf_reg': [1, 3, 5]
    }),
    ('mapie', MapieRegressor(
        estimator=GradientBoostingRegressor(random_state=42),
        cv=5,
        method="plus"
    ), {
        'regressor__estimator__n_estimators': [100, 200],
        'regressor__estimator__max_depth': [5, 6],
    }),
]

In [35]:
class WeightedVotingRegressor(BaseEstimator, RegressorMixin):
    def __init__(self, models, weights):
        self.models = models
        self.weights = weights

    def predict(self, X):
        preds = np.array([model.predict(X) for model in self.models])
        weighted_predictions = np.average(preds, axis=0, weights=self.weights)
        return weighted_predictions

def search_model(X, y, random_seed, models):
    best_models = []
    i = 0
    for name, model, params in models:
        pipeline = Pipeline([
            ('scaler', StandardScaler()),
            ('regressor', model)
        ])

        param_grid = {
            'scaler': [StandardScaler(), MinMaxScaler(), None]
        }
        param_grid.update(params)
        kf = KFold(n_splits=5, shuffle=True, random_state=random_seed)
        grid_search = GridSearchCV(pipeline, param_grid, cv=kf, scoring='r2', n_jobs=1, verbose = 1)
        grid_search.fit(X, y)

        cv_scores = cross_val_score(grid_search.best_estimator_, X, y, cv=kf, scoring='r2')
        mean_cv_score = np.mean(cv_scores)

        i += 1
        print(f"#{i} Model: {name}")
        print(f"Best parameters: {grid_search.best_params_}")
        print(f"Mean cross-validation score: {mean_cv_score}")
        best = grid_search.best_estimator_
        best_models.append(best)
        print('-' * 30)

    return best_models

In [36]:
search_model(train_df_TL.drop(columns=[train_df_TL.columns[1], 'username']), train_df_TL[train_df_TL.columns[1]], 42, models)

Fitting 5 folds for each of 18 candidates, totalling 90 fits




#1 Model: ridge
Best parameters: {'regressor__alpha': 6, 'scaler': MinMaxScaler()}
Mean cross-validation score: 0.21171716888403944
------------------------------
Fitting 5 folds for each of 36 candidates, totalling 180 fits


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


#2 Model: elastic_net
Best parameters: {'regressor__alpha': 0.01, 'regressor__l1_ratio': 0.2, 'scaler': MinMaxScaler()}
Mean cross-validation score: 0.19883845891276114
------------------------------
Fitting 5 folds for each of 75 candidates, totalling 375 fits
#3 Model: svr
Best parameters: {'regressor__C': 2.0, 'regressor__epsilon': 0.6, 'regressor__kernel': 'rbf', 'scaler': MinMaxScaler()}
Mean cross-validation score: 0.24685953306446873
------------------------------
Fitting 5 folds for each of 45 candidates, totalling 225 fits
#4 Model: random_forest
Best parameters: {'regressor__max_depth': None, 'regressor__n_estimators': 100, 'scaler': StandardScaler()}
Mean cross-validation score: 0.18301303094758958
------------------------------
Fitting 5 folds for each of 9 candidates, totalling 45 fits
#5 Model: knn_regressor
Best parameters: {'regressor__n_neighbors': 15, 'regressor__weights': 'distance', 'scaler': StandardScaler()}
Mean cross-validation score: 0.1646386999532591
--------

[Pipeline(steps=[('scaler', MinMaxScaler()), ('regressor', Ridge(alpha=6))]),
 Pipeline(steps=[('scaler', MinMaxScaler()),
                 ('regressor',
                  ElasticNet(alpha=0.01, l1_ratio=0.2, max_iter=500000))]),
 Pipeline(steps=[('scaler', MinMaxScaler()),
                 ('regressor', SVR(C=2.0, epsilon=0.6))]),
 Pipeline(steps=[('scaler', StandardScaler()),
                 ('regressor', RandomForestRegressor(random_state=42))]),
 Pipeline(steps=[('scaler', StandardScaler()),
                 ('regressor',
                  KNeighborsRegressor(n_neighbors=15, weights='distance'))]),
 Pipeline(steps=[('scaler', MinMaxScaler()),
                 ('regressor',
                  <catboost.core.CatBoostRegressor object at 0x7ed06e1c9360>)]),
 Pipeline(steps=[('scaler', MinMaxScaler()),
                 ('regressor',
                  MapieRegressor(cv=5,
                                 estimator=GradientBoostingRegressor(max_depth=5,
                                   

In [37]:
X_TBTL = train_df_TL.drop(columns=[train_df_TL.columns[1], 'username'])
y_TBTL = train_df_TL[train_df_TL.columns[1]]

model_TBTL1 = Pipeline(steps=[('scaler', MinMaxScaler()),
                            ('regressor', SVR(C=5.0, epsilon=0.4))])
model_TBTL1.fit(X_TBTL, y_TBTL)

model_TBTL2 =  Pipeline(steps=[('scaler', None),
                 ('regressor', CatBoostRegressor(depth=9, l2_leaf_reg=1, iterations=400, learning_rate=0.03, verbose = 0, task_type="GPU", devices='0'))])
model_TBTL2.fit(X_TBTL, y_TBTL)

list_model_TBTL = [model_TBTL1, model_TBTL2]
weights_TBTL = [0.4, 0.8]
voting_TBTL = WeightedVotingRegressor(models=list_model_TBTL, weights=weights_TBTL)

In [38]:
search_model(train_df_QT.drop(columns=[train_df_QT.columns[1], 'username']), train_df_QT[train_df_QT.columns[1]], 42, models)

Fitting 5 folds for each of 18 candidates, totalling 90 fits




#1 Model: ridge
Best parameters: {'regressor__alpha': 5, 'scaler': MinMaxScaler()}
Mean cross-validation score: 0.2955278151196154
------------------------------
Fitting 5 folds for each of 36 candidates, totalling 180 fits


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


#2 Model: elastic_net
Best parameters: {'regressor__alpha': 0.01, 'regressor__l1_ratio': 0.4, 'scaler': MinMaxScaler()}
Mean cross-validation score: 0.30179215902993867
------------------------------
Fitting 5 folds for each of 75 candidates, totalling 375 fits
#3 Model: svr
Best parameters: {'regressor__C': 7.0, 'regressor__epsilon': 0.4, 'regressor__kernel': 'rbf', 'scaler': MinMaxScaler()}
Mean cross-validation score: 0.32657010716776314
------------------------------
Fitting 5 folds for each of 45 candidates, totalling 225 fits
#4 Model: random_forest
Best parameters: {'regressor__max_depth': None, 'regressor__n_estimators': 50, 'scaler': StandardScaler()}
Mean cross-validation score: 0.34833217420096546
------------------------------
Fitting 5 folds for each of 9 candidates, totalling 45 fits
#5 Model: knn_regressor
Best parameters: {'regressor__n_neighbors': 10, 'regressor__weights': 'distance', 'scaler': MinMaxScaler()}
Mean cross-validation score: 0.3288588062919497
-----------

[Pipeline(steps=[('scaler', MinMaxScaler()), ('regressor', Ridge(alpha=5))]),
 Pipeline(steps=[('scaler', MinMaxScaler()),
                 ('regressor',
                  ElasticNet(alpha=0.01, l1_ratio=0.4, max_iter=500000))]),
 Pipeline(steps=[('scaler', MinMaxScaler()),
                 ('regressor', SVR(C=7.0, epsilon=0.4))]),
 Pipeline(steps=[('scaler', StandardScaler()),
                 ('regressor',
                  RandomForestRegressor(n_estimators=50, random_state=42))]),
 Pipeline(steps=[('scaler', MinMaxScaler()),
                 ('regressor',
                  KNeighborsRegressor(n_neighbors=10, weights='distance'))]),
 Pipeline(steps=[('scaler', StandardScaler()),
                 ('regressor',
                  <catboost.core.CatBoostRegressor object at 0x7ed0703d1c30>)]),
 Pipeline(steps=[('scaler', None),
                 ('regressor',
                  MapieRegressor(cv=5,
                                 estimator=GradientBoostingRegressor(max_depth=6,
          

In [39]:
X_QT = train_df_QT.drop(columns=[train_df_QT.columns[1], 'username'])
y_QT = train_df_QT[train_df_QT.columns[1]]

model_QT1 = Pipeline(steps=[('scaler', MinMaxScaler()),
                            ('regressor', SVR(C=5.0, epsilon=0.4))])
model_QT1.fit(X_QT, y_QT)

model_QT2 = Pipeline(steps=[('scaler', StandardScaler()),
                 ('regressor',
                  MapieRegressor(cv=5,
                                 estimator=GradientBoostingRegressor(random_state=42, n_estimators=400, learning_rate=0.05)))])
model_QT2.fit(X_QT, y_QT)


model_QT3 =  Pipeline(steps=[('scaler', StandardScaler()),
                 ('regressor',
                  RandomForestRegressor(n_estimators=100, random_state=42, min_samples_leaf=6))])

model_QT3.fit(X_QT, y_QT)

model_QT4 =  Pipeline(steps=[('scaler', MinMaxScaler()),
                 ('regressor',
                  KNeighborsRegressor(n_neighbors=21, weights='distance'))])
model_QT4.fit(X_QT, y_QT)

list_model_QT = [model_QT1, model_QT2, model_QT3, model_QT4]
weights_QT = [0.4, 0.3, 0.2, 0.2]
voting_QT = WeightedVotingRegressor(models=list_model_QT, weights=weights_QT)

In [40]:
search_model(train_df_TH.drop(columns=[train_df_TH.columns[1], 'username']), train_df_TH[train_df_TH.columns[1]], 42, models)

Fitting 5 folds for each of 18 candidates, totalling 90 fits




#1 Model: ridge
Best parameters: {'regressor__alpha': 5, 'scaler': MinMaxScaler()}
Mean cross-validation score: 0.4042092177965909
------------------------------
Fitting 5 folds for each of 36 candidates, totalling 180 fits


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


#2 Model: elastic_net
Best parameters: {'regressor__alpha': 0.01, 'regressor__l1_ratio': 0.2, 'scaler': MinMaxScaler()}
Mean cross-validation score: 0.4021465110974802
------------------------------
Fitting 5 folds for each of 75 candidates, totalling 375 fits
#3 Model: svr
Best parameters: {'regressor__C': 7.0, 'regressor__epsilon': 0.6, 'regressor__kernel': 'rbf', 'scaler': MinMaxScaler()}
Mean cross-validation score: 0.4365971440553523
------------------------------
Fitting 5 folds for each of 45 candidates, totalling 225 fits
#4 Model: random_forest
Best parameters: {'regressor__max_depth': None, 'regressor__n_estimators': 70, 'scaler': MinMaxScaler()}
Mean cross-validation score: 0.4011572757001801
------------------------------
Fitting 5 folds for each of 9 candidates, totalling 45 fits
#5 Model: knn_regressor
Best parameters: {'regressor__n_neighbors': 15, 'regressor__weights': 'distance', 'scaler': MinMaxScaler()}
Mean cross-validation score: 0.34332004510236275
---------------

[Pipeline(steps=[('scaler', MinMaxScaler()), ('regressor', Ridge(alpha=5))]),
 Pipeline(steps=[('scaler', MinMaxScaler()),
                 ('regressor',
                  ElasticNet(alpha=0.01, l1_ratio=0.2, max_iter=500000))]),
 Pipeline(steps=[('scaler', MinMaxScaler()),
                 ('regressor', SVR(C=7.0, epsilon=0.6))]),
 Pipeline(steps=[('scaler', MinMaxScaler()),
                 ('regressor',
                  RandomForestRegressor(n_estimators=70, random_state=42))]),
 Pipeline(steps=[('scaler', MinMaxScaler()),
                 ('regressor',
                  KNeighborsRegressor(n_neighbors=15, weights='distance'))]),
 Pipeline(steps=[('scaler', None),
                 ('regressor',
                  <catboost.core.CatBoostRegressor object at 0x7ed082a0b460>)]),
 Pipeline(steps=[('scaler', MinMaxScaler()),
                 ('regressor',
                  MapieRegressor(cv=5,
                                 estimator=GradientBoostingRegressor(max_depth=6,
              

In [41]:
X_TH = train_df_TH.drop(columns=[train_df_TH.columns[1], 'username'])
y_TH = train_df_TH[train_df_TH.columns[1]]

model_TH1 = Pipeline(steps=[('scaler', MinMaxScaler()),
                            ('regressor', SVR(C=5.0, epsilon=0.4))])
model_TH1.fit(X_TH, y_TH)

model_TH2 =  Pipeline(steps=[('scaler', None),
                 ('regressor', CatBoostRegressor(depth=9, l2_leaf_reg=1, iterations=400, learning_rate=0.03, verbose = 0, task_type="GPU", devices='0'))])
model_TH2.fit(X_TH, y_TH)

list_model_TH = [model_TH1, model_TH2]
weights_TH = [0.5,  0.5]
voting_TH = WeightedVotingRegressor(models=list_model_TH, weights=weights_TH)

In [42]:
search_model(train_df_CK.drop(columns=[train_df_CK.columns[1], 'username']), train_df_CK[train_df_CK.columns[1]], 42, models)

Fitting 5 folds for each of 18 candidates, totalling 90 fits




#1 Model: ridge
Best parameters: {'regressor__alpha': 5, 'scaler': MinMaxScaler()}
Mean cross-validation score: 0.288297078175872
------------------------------
Fitting 5 folds for each of 36 candidates, totalling 180 fits


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


#2 Model: elastic_net
Best parameters: {'regressor__alpha': 0.01, 'regressor__l1_ratio': 0.2, 'scaler': MinMaxScaler()}
Mean cross-validation score: 0.28815673741425246
------------------------------
Fitting 5 folds for each of 75 candidates, totalling 375 fits
#3 Model: svr
Best parameters: {'regressor__C': 5.0, 'regressor__epsilon': 1, 'regressor__kernel': 'rbf', 'scaler': MinMaxScaler()}
Mean cross-validation score: 0.32589984324434573
------------------------------
Fitting 5 folds for each of 45 candidates, totalling 225 fits
#4 Model: random_forest
Best parameters: {'regressor__max_depth': None, 'regressor__n_estimators': 70, 'scaler': None}
Mean cross-validation score: 0.30040716712098775
------------------------------
Fitting 5 folds for each of 9 candidates, totalling 45 fits
#5 Model: knn_regressor
Best parameters: {'regressor__n_neighbors': 15, 'regressor__weights': 'distance', 'scaler': MinMaxScaler()}
Mean cross-validation score: 0.23955112753032495
------------------------

[Pipeline(steps=[('scaler', MinMaxScaler()), ('regressor', Ridge(alpha=5))]),
 Pipeline(steps=[('scaler', MinMaxScaler()),
                 ('regressor',
                  ElasticNet(alpha=0.01, l1_ratio=0.2, max_iter=500000))]),
 Pipeline(steps=[('scaler', MinMaxScaler()),
                 ('regressor', SVR(C=5.0, epsilon=1))]),
 Pipeline(steps=[('scaler', None),
                 ('regressor',
                  RandomForestRegressor(n_estimators=70, random_state=42))]),
 Pipeline(steps=[('scaler', MinMaxScaler()),
                 ('regressor',
                  KNeighborsRegressor(n_neighbors=15, weights='distance'))]),
 Pipeline(steps=[('scaler', StandardScaler()),
                 ('regressor',
                  <catboost.core.CatBoostRegressor object at 0x7ed06af5d3c0>)]),
 Pipeline(steps=[('scaler', None),
                 ('regressor',
                  MapieRegressor(cv=5,
                                 estimator=GradientBoostingRegressor(max_depth=5,
                        

In [43]:
X_CK = train_df_CK.drop(columns=[train_df_CK.columns[1], 'username'])
y_CK = train_df_CK[train_df_CK.columns[1]]

model_CK1 = Pipeline(steps=[('scaler', MinMaxScaler()),
                            ('regressor', SVR(C=5.0, epsilon=0.4))])
model_CK1.fit(X_CK, y_CK)


model_CK2 = Pipeline(steps=[('scaler', StandardScaler()),
                 ('regressor',
                  MapieRegressor(cv=5,
                                 estimator=GradientBoostingRegressor(random_state=42, n_estimators =400, learning_rate=0.05)))])
model_CK2.fit(X_CK, y_CK)

model_CK3 = Pipeline(steps=[('scaler', StandardScaler()),
                 ('regressor',
                  RandomForestRegressor(max_depth=10, n_estimators=70,
                                        random_state=42))])
model_CK3.fit(X_CK, y_CK)

model_CK4 = Pipeline(steps=[('scaler', StandardScaler()),
                 ('regressor', CatBoostRegressor(depth=9, l2_leaf_reg=1, iterations=400, learning_rate=0.03, verbose=0))])
model_CK4.fit(X_CK, y_CK)

list_model_CK = [model_CK1, model_CK2, model_CK3, model_CK4]
weights_CK = [0.3, 0.3, 0.1, 0.35]
voting_CK = WeightedVotingRegressor(models=list_model_CK, weights=weights_CK)

## Xuất file dự đoán

In [44]:
def predict_submit(model, df, desc, predict_col):
    tdf = df.copy()
    predicted_target = model.predict(tdf.drop(columns=['username']))
    tdf['target'] = predicted_target
    tdf['target'] = tdf['target'].apply(lambda x: min(max(x, 0), 10))
    output_df = tdf[['username', 'target']]
    output_df.to_csv(f'out_{predict_col}_{desc}.csv', index=False, header=False)

predict_submit(voting_TBTL, predict_df_TL, 'final', 'TBTL')
predict_submit(voting_QT, predict_df_QT, 'final', 'diemqt')
predict_submit(voting_TH, predict_df_TH, 'final', 'TH')
predict_submit(voting_CK, predict_df_CK, 'final', 'CK')

### H2O

In [45]:
# import h2o
# from h2o.automl import H2OAutoML

# h2o.init(max_mem_size="20g") 

# def h2o_automl_predict(train_df, predict_df, target_col, max_runtime_secs=1200):
#     """
#     Trains an H2O AutoML model and makes predictions.

#     Args:
#         train_df: Training DataFrame (pandas).
#         predict_df: Prediction DataFrame (pandas).
#         target_col: Name of the target column.
#         max_runtime_secs: Maximum runtime for AutoML in seconds.

#     Returns:
#         Pandas DataFrame with predictions.
#     """

#     train_hf = h2o.H2OFrame(train_df)
#     predict_hf = h2o.H2OFrame(predict_df)

#     x = train_hf.columns
#     y = target_col
#     x.remove(y)
#     x.remove("username") 

#     aml = H2OAutoML(max_runtime_secs=max_runtime_secs, 
#                     seed=42,
#                     sort_metric="RMSE",
#                     exclude_algos=["StackedEnsemble", "DeepLearning"])
#     aml.train(x=x, y=y, training_frame=train_hf)

#     lb = aml.leaderboard
#     print(lb.head(rows=lb.nrows))

#     preds = aml.predict(predict_hf)

#     preds_df = h2o.as_list(preds)
#     preds_df.columns = ["target"]
    
#     output_df = h2o.as_list(predict_hf[["username"]])
#     output_df["target"] = preds_df["target"]

#     return output_df

# output_df_TBTL = h2o_automl_predict(train_df_TL, predict_df_TL, "TBTL", max_runtime_secs=600)
# output_df_TBTL["target"] = output_df_TBTL["target"].apply(lambda x: min(max(x, 0), 10))
# output_df_TBTL.to_csv("out_TBTL_h2o_automl.csv", index=False, header=False)

# output_df_QT = h2o_automl_predict(train_df_QT, predict_df_QT, "diemqt", max_runtime_secs=600)
# output_df_QT["target"] = output_df_QT["target"].apply(lambda x: min(max(x, 0), 10))
# output_df_QT.to_csv("out_diemqt_h2o_automl.csv", index=False, header=False)

# output_df_TH = h2o_automl_predict(train_df_TH, predict_df_TH, "TH", max_runtime_secs=600)
# output_df_TH["target"] = output_df_TH["target"].apply(lambda x: min(max(x, 0), 10))
# output_df_TH.to_csv("out_TH_h2o_automl.csv", index=False, header=False)

# output_df_CK = h2o_automl_predict(train_df_CK, predict_df_CK, "CK", max_runtime_secs=600)
# output_df_CK["target"] = output_df_CK["target"].apply(lambda x: min(max(x, 0), 10))
# output_df_CK.to_csv("out_CK_h2o_automl.csv", index=False, header=False)

# h2o.cluster().shutdown()

### AutoGluon

In [46]:
# from autogluon.tabular import TabularPredictor

# def autogluon_regressor(train_df, predict_col, eval_metric='r2', presets='best_quality', time_limit=3600):
#     predictor = TabularPredictor(
#         label=predict_col,
#         eval_metric=eval_metric,
#         path=f'autogluon_predictions_{predict_col}'  
#     ).fit(
#         train_data=train_df.drop(columns=['username']),
#         time_limit=time_limit,
#         num_gpus=1,
#         presets=presets
#     )
    
#     leaderboard = predictor.leaderboard()
#     print(leaderboard)
#     return predictor

# autogluon_tbtl = autogluon_regressor(train_df_TL, 'TBTL')
# autogluon_qt = autogluon_regressor(train_df_QT, 'diemqt')
# autogluon_th = autogluon_regressor(train_df_TH, 'TH')
# autogluon_ck = autogluon_regressor(train_df_CK, 'CK')
# predict_submit(autogluon_tbtl, predict_df_TL, 'autogluon', 'TBTL')
# predict_submit(autogluon_qt, predict_df_QT, 'autogluon', 'diemqt')
# predict_submit(autogluon_th, predict_df_TH, 'autogluon', 'TH')
# predict_submit(autogluon_ck, predict_df_CK, 'autogluon', 'CK')

### MLJAR

In [47]:
# from supervised.automl import AutoML

# def mljar_regressor(train_df, predict_col, mode="Compete", eval_metric='r2', total_time_limit=3600):
#     X_train = train_df.drop(columns=['username', predict_col])
#     y_train = train_df[predict_col]

#     automl = AutoML(
#         mode=mode,
#         eval_metric=eval_metric,
#         total_time_limit=total_time_limit,
#         random_state=42,
#         results_path=f'mljar_predictions_{predict_col}',
#         ml_task='regression'
#     )
#     automl.fit(X_train, y_train)
#     return automl

# mljar_tbtl = mljar_regressor(train_df_TL, 'TBTL')
# mljar_qt = mljar_regressor(train_df_QT, 'diemqt')
# mljar_th = mljar_regressor(train_df_TH, 'TH')
# mljar_ck = mljar_regressor(train_df_CK, 'CK')

# predict_submit(mljar_tbtl, predict_df_TL, 'mljar', 'TBTL')
# predict_submit(mljar_qt, predict_df_QT, 'mljar', 'diemqt')
# predict_submit(mljar_th, predict_df_TH, 'mljar', 'TH')
# predict_submit(mljar_ck, predict_df_CK, 'mljar', 'CK')