In [2]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor

data = pd.read_csv("/content/jamb_exam_results.csv")

data.columns = data.columns.str.lower().str.replace(' ', '_')

In [3]:
data = data.drop("student_id", axis=1)

In [4]:
data.fillna(0)

Unnamed: 0,jamb_score,study_hours_per_week,attendance_rate,teacher_quality,distance_to_school,school_type,school_location,extra_tutorials,access_to_learning_materials,parent_involvement,it_knowledge,age,gender,socioeconomic_status,parent_education_level,assignments_completed
0,192,22,78,4,12.4,Public,Urban,Yes,Yes,High,Medium,17,Male,Low,Tertiary,2
1,207,14,88,4,2.7,Public,Rural,No,Yes,High,High,15,Male,High,0,1
2,182,29,87,2,9.6,Public,Rural,Yes,Yes,High,Medium,20,Female,High,Tertiary,2
3,210,29,99,2,2.6,Public,Urban,No,Yes,Medium,High,22,Female,Medium,Tertiary,1
4,199,12,98,3,8.8,Public,Urban,No,Yes,Medium,Medium,22,Female,Medium,Tertiary,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,183,20,74,2,10.6,Public,Urban,Yes,No,Low,Low,16,Male,Medium,Primary,2
4996,179,0,80,2,20.0,Public,Rural,No,Yes,Medium,Medium,22,Male,Low,Secondary,1
4997,261,17,89,3,11.3,Public,Urban,No,No,Low,High,18,Male,Medium,Primary,3
4998,183,15,96,2,15.9,Public,Rural,No,No,Low,Medium,18,Male,Medium,Secondary,1


In [5]:
X = data
y = data["jamb_score"]
X = X.drop("jamb_score", axis=1)
X.fillna(0)
X['parent_education_level'] = X['parent_education_level'].replace(np.nan, 0)
print(X.isnull().sum())

study_hours_per_week            0
attendance_rate                 0
teacher_quality                 0
distance_to_school              0
school_type                     0
school_location                 0
extra_tutorials                 0
access_to_learning_materials    0
parent_involvement              0
it_knowledge                    0
age                             0
gender                          0
socioeconomic_status            0
parent_education_level          0
assignments_completed           0
dtype: int64


In [6]:
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size = 0.4, random_state = 1)

X_test, X_val, y_test, y_val = train_test_split(X_temp, y_temp, test_size=0.5, random_state = 1)

In [7]:
vec = DictVectorizer(sparse=True)


X_train_dict = X_train.to_dict(orient='records')
X_valid_dict = X_val.to_dict(orient='records')
X_test_dict = X_test.to_dict(orient='records')

X_train = vec.fit_transform(X_train_dict)
X_valid = vec.transform(X_valid_dict)
X_test = vec.transform(X_test_dict)

**Вопрос 1**

In [8]:
model = DecisionTreeRegressor(max_depth=1, random_state=1)
model.fit(X_train, y_train)

best_feature_index = model.tree_.feature[0]
feature_name = vec.get_feature_names_out()[best_feature_index]

print("Признак, использованный для разбиения:", feature_name)

Признак, использованный для разбиения: study_hours_per_week


**Вопрос 2**

In [9]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error


model_rf = RandomForestRegressor(n_estimators=10, random_state=1, n_jobs=-1)
model_rf.fit(X_train, y_train)

y_pred_valid = model_rf.predict(X_valid)

# Рассчитываем RMSE
rmse_valid = np.sqrt(mean_squared_error(y_val, y_pred_valid))

# Выводим RMSE
print(f"RMSE на валидационных данных: {rmse_valid}")

RMSE на валидационных данных: 42.424566585882765


**Вопрос 3**

In [10]:
rmse_values = []

# Перебираем значения n_estimators от 10 до 200 с шагом 10
for n in range(10, 201, 10):

    model_rf = RandomForestRegressor(n_estimators=n, random_state=1, n_jobs=-1)
    model_rf.fit(X_train, y_train)

    y_pred_valid = model_rf.predict(X_valid)
    rmse_valid = np.sqrt(mean_squared_error(y_val, y_pred_valid))

    # Добавляем RMSE в список
    rmse_values.append((n, round(rmse_valid, 3)))

# Выводим результаты
for n, rmse in rmse_values:
    print(f"n_estimators={n}, RMSE={rmse}")

n_estimators=10, RMSE=42.425
n_estimators=20, RMSE=41.223
n_estimators=30, RMSE=40.977
n_estimators=40, RMSE=40.709
n_estimators=50, RMSE=40.502
n_estimators=60, RMSE=40.323
n_estimators=70, RMSE=40.289
n_estimators=80, RMSE=40.266
n_estimators=90, RMSE=40.248
n_estimators=100, RMSE=40.309
n_estimators=110, RMSE=40.319
n_estimators=120, RMSE=40.315
n_estimators=130, RMSE=40.306
n_estimators=140, RMSE=40.271
n_estimators=150, RMSE=40.303
n_estimators=160, RMSE=40.285
n_estimators=170, RMSE=40.28
n_estimators=180, RMSE=40.262
n_estimators=190, RMSE=40.276
n_estimators=200, RMSE=40.256


**Вопрос 4**

In [11]:
avg_rmse_results = {}
max_depth_values = [10, 15, 20, 25]
n_estimators_values = range(10, 201, 10)

for max_depth in max_depth_values:
    rmse_for_max_depth = []

    for n_estimators in n_estimators_values:
        model_rf = RandomForestRegressor(n_estimators=n_estimators, max_depth=max_depth, random_state=1, n_jobs=-1)
        model_rf.fit(X_train, y_train)

        y_pred_valid = model_rf.predict(X_valid)
        rmse_valid = np.sqrt(mean_squared_error(y_val, y_pred_valid))
        rmse_for_max_depth.append(rmse_valid)

    # Рассчитываем среднее значение RMSE для данного max_depth
    avg_rmse_results[max_depth] = np.mean(rmse_for_max_depth)

# Выводим средние значения RMSE для каждого max_depth
for max_depth, avg_rmse in avg_rmse_results.items():
    print(f"max_depth: {max_depth}, Среднее RMSE: {round(avg_rmse, 3)}")

# Находим значение max_depth с наименьшим средним RMSE
best_max_depth = min(avg_rmse_results, key=avg_rmse_results.get)
print(f"\nЛучшее значение max_depth по среднему RMSE: {best_max_depth}")

max_depth: 10, Среднее RMSE: 40.226
max_depth: 15, Среднее RMSE: 40.409
max_depth: 20, Среднее RMSE: 40.494
max_depth: 25, Среднее RMSE: 40.502

Лучшее значение max_depth по среднему RMSE: 10


**Вопрос 5**

In [12]:
model_rf = RandomForestRegressor(n_estimators=10, max_depth=20, random_state=1, n_jobs=-1)
model_rf.fit(X_train, y_train)

# Извлекаем важность признаков
feature_importances = model_rf.feature_importances_

# Находим индекс наиболее важного признака
most_important_feature_index = np.argmax(feature_importances)

# Получаем имя самого важного признака
most_important_feature_name = vec.get_feature_names_out()[most_important_feature_index]

# Выводим информацию о важности признаков
print(f"Наиболее важный признак: {most_important_feature_name}")
print(f"Важность этого признака: {feature_importances[most_important_feature_index]:.4f}")

Наиболее важный признак: study_hours_per_week
Важность этого признака: 0.2541


**Вопрос 6**

In [13]:
!pip install xgboost



In [14]:
import xgboost as xgb

dtrain = xgb.DMatrix(X_train, label=y_train)
dvalid = xgb.DMatrix(X_valid, label=y_val)

watchlist = [(dtrain, 'train'), (dvalid, 'valid')]

xgb_params = {
    'eta': 0.3,
    'max_depth': 6,
    'min_child_weight': 1,
    'objective': 'reg:squarederror',
    'nthread': 8,
    'seed': 1,
    'verbosity': 1,
}

model_0_3 = xgb.train(xgb_params, dtrain, num_boost_round=100, evals=watchlist, early_stopping_rounds=10)

# Оценка RMSE на валидационном наборе данных с eta=0.3
y_pred_valid_0_3 = model_0_3.predict(dvalid)
rmse_valid_0_3 = np.sqrt(mean_squared_error(y_val, y_pred_valid_0_3))
print(f"RMSE на валидации с eta=0.3: {rmse_valid_0_3}")

xgb_params['eta'] = 0.1

# Обучаем модель с eta=0.1
model_0_1 = xgb.train(xgb_params, dtrain, num_boost_round=100, evals=watchlist, early_stopping_rounds=10)

# Оценка RMSE на валидационном наборе данных с eta=0.1
y_pred_valid_0_1 = model_0_1.predict(dvalid)
rmse_valid_0_1 = np.sqrt(mean_squared_error(y_val, y_pred_valid_0_1))
print(f"RMSE на валидации с eta=0.1: {rmse_valid_0_1}")

[0]	train-rmse:42.84835	valid-rmse:45.00154
[1]	train-rmse:39.96423	valid-rmse:43.23126
[2]	train-rmse:37.91231	valid-rmse:41.88497
[3]	train-rmse:36.51126	valid-rmse:41.34235
[4]	train-rmse:35.52212	valid-rmse:40.73916
[5]	train-rmse:34.77126	valid-rmse:40.52718
[6]	train-rmse:34.03898	valid-rmse:40.41024
[7]	train-rmse:33.62820	valid-rmse:40.22150
[8]	train-rmse:32.94729	valid-rmse:40.12197
[9]	train-rmse:32.27703	valid-rmse:39.99663
[10]	train-rmse:31.73818	valid-rmse:40.03250
[11]	train-rmse:31.31360	valid-rmse:40.09954
[12]	train-rmse:30.72949	valid-rmse:40.25917
[13]	train-rmse:30.11486	valid-rmse:40.36211
[14]	train-rmse:29.43538	valid-rmse:40.56702
[15]	train-rmse:29.23018	valid-rmse:40.63507
[16]	train-rmse:28.64113	valid-rmse:40.74688
[17]	train-rmse:28.42128	valid-rmse:40.81394
[18]	train-rmse:28.36245	valid-rmse:40.83695
RMSE на валидации с eta=0.3: 40.821371574916306
[0]	train-rmse:45.64414	valid-rmse:47.20093
[1]	train-rmse:44.26862	valid-rmse:46.15000
[2]	train-rmse:43.0

In [None]:
"""
1)Признак, использованный для разбиения: study_hours_per_week
2)42.42
3)90
4)10
5)study_hours_per_week
6)0.1
"""