<a href="https://colab.research.google.com/github/sokanaid/-/blob/master/model_learning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Обучение моделей предсказания числинности на плато

In [117]:
import pandas as pd
import os
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
np.random.seed(0)

In [118]:
from google.colab import drive
drive.mount('/content/drive/')
%cd /content/drive/MyDrive/sim_tables

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).
/content/drive/MyDrive/sim_tables


## Подготовка датасета

In [119]:
file_name = "/content/drive/MyDrive/sim_tables/data_set1.csv"
sim_frame = pd.read_csv(file_name, sep='\t').drop("Unnamed: 0", axis = 1)

In [120]:
sim_frame.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 230736 entries, 0 to 230735
Data columns (total 9 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   b              230736 non-null  float64
 1   d              230736 non-null  float64
 2   death_r        230736 non-null  float64
 3   dd             230736 non-null  float64
 4   sd_b           230736 non-null  float64
 5   sd_d           230736 non-null  float64
 6   area_length_x  230736 non-null  float64
 7   initial_pop    230736 non-null  float64
 8   plateau_pop    230736 non-null  float64
dtypes: float64(9)
memory usage: 15.8 MB


In [121]:
print("total count of simulations:", len(sim_frame))
df = sim_frame[sim_frame["area_length_x"]==100.0].drop("area_length_x", axis = 1)
df1 = df.drop_duplicates(subset=['b', 'd', 'death_r', 'dd', 'sd_b', 'sd_d', 'initial_pop']).reset_index(drop=True)
print("total count of simulations to use in learning:", len(df))

total count of simulations: 230736
total count of simulations to use in learning: 162704


In [122]:
# рапределение размеров в изначальном датасете
sim_frame["area_length_x"].value_counts()

100.0     162704
1100.0      2345
3100.0      2345
2100.0      2345
4100.0      2345
2500.0      2343
3300.0      2343
3500.0      2343
2300.0      2343
300.0       2343
1700.0      2343
3900.0      2343
2900.0      2343
4300.0      2343
4500.0      2343
500.0       2343
1900.0      2343
3700.0      2343
5100.0      2343
2700.0      2343
700.0       2343
900.0       2343
4700.0      2343
1300.0      2343
1500.0      2343
5700.0      2342
5500.0      2342
4900.0      2342
5900.0      2342
5300.0      2342
2600.0        14
1600.0        14
3600.0        14
600.0         14
4600.0        13
5600.0        13
Name: area_length_x, dtype: int64

In [123]:
# распределение параметров
for column in list(df.columns.values):
  print(column)
  print(df[column].describe())

b
count    162704.000000
mean          0.668536
std           0.230048
min           0.100000
25%           0.550000
50%           0.750000
75%           0.850000
max           0.950000
Name: b, dtype: float64
d
count    162704.000000
mean          0.359525
std           0.220228
min           0.100000
25%           0.200000
50%           0.300000
75%           0.500000
max           0.900000
Name: d, dtype: float64
death_r
count    162704.000000
mean          4.986707
std           3.064961
min           0.100000
25%           2.000000
50%           5.000000
75%           8.000000
max          10.000000
Name: death_r, dtype: float64
dd
count    162704.000000
mean          0.549765
std           0.287154
min           0.100000
25%           0.300000
50%           0.550000
75%           0.800000
max           1.000000
Name: dd, dtype: float64
sd_b
count    162704.000000
mean          0.725316
std           0.167597
min           0.500000
25%           0.600000
50%           0.750000
75%

In [124]:
X = df.drop("plateau_pop", axis =1)
y = df["plateau_pop"]

In [125]:
X.head()

Unnamed: 0,b,d,death_r,dd,sd_b,sd_d,initial_pop
0,0.45,0.3,7.0,0.7,0.95,0.8,1.0
1,0.95,0.8,9.0,0.2,0.8,0.95,1.0
2,0.35,0.2,1.0,0.6,0.8,0.65,1.0
3,0.35,0.2,6.0,0.1,0.65,0.65,1.0
4,0.65,0.1,3.0,0.8,0.65,0.5,1.0


In [126]:
y.head()

0    2.403756e-111
1    4.701355e-110
2    8.296506e-112
3    3.113991e-109
4     1.366164e+02
Name: plateau_pop, dtype: float64

In [127]:
X_train, X_t, y_train, y_t = train_test_split(X, y, test_size=0.3)
X_train

Unnamed: 0,b,d,death_r,dd,sd_b,sd_d,initial_pop
189335,0.35,0.3,1.0,0.6,0.50,0.95,50.0
10822,0.95,0.7,7.0,0.8,0.65,0.50,1.0
85367,0.85,0.7,6.0,0.7,0.95,0.65,50.0
83650,0.55,0.5,4.0,0.1,0.50,0.65,50.0
58944,0.45,0.4,5.0,0.4,0.80,0.95,1.0
...,...,...,...,...,...,...,...
138489,0.75,0.2,3.0,0.9,0.95,0.80,1.0
136032,0.95,0.3,7.0,0.9,0.50,0.50,1.0
215975,0.85,0.3,2.0,0.3,0.80,0.80,100.0
167260,0.65,0.6,9.0,0.6,0.65,0.50,1.0


In [128]:
X_test,X_val, y_test, y_val = train_test_split(X_t, y_t, test_size=0.4)
X_val

Unnamed: 0,b,d,death_r,dd,sd_b,sd_d,initial_pop
159508,0.65,0.4,8.0,0.5,0.65,0.50,50.0
162049,0.95,0.3,2.0,0.7,0.95,0.65,1.0
66505,0.35,0.1,2.0,0.1,0.65,0.65,50.0
22520,0.15,0.1,8.0,0.4,0.95,0.65,50.0
79189,0.45,0.3,1.0,0.9,0.95,0.95,1.0
...,...,...,...,...,...,...,...
190612,0.95,0.2,10.0,0.9,0.95,0.65,50.0
53250,0.85,0.2,3.0,0.5,0.95,0.65,1.0
65462,0.75,0.7,1.0,0.4,0.80,0.50,50.0
37963,0.55,0.2,5.0,0.2,0.65,0.80,1.0


In [129]:
print("train len", len(y_train))
print("test len ", len(y_t))
print("val len ", len(y_val))


train len 113892
test len  48812
val len  19525


In [130]:
# нормализация данных
scaler = StandardScaler()
scaler.fit(X_train)
X_train = pd.DataFrame(scaler.transform(X_train), columns = X_train.columns)
X_test = pd.DataFrame(scaler.transform(X_test), columns = X_test.columns)
X_val = pd.DataFrame(scaler.transform(X_val), columns = X_val.columns)
X_train

Unnamed: 0,b,d,death_r,dd,sd_b,sd_d,initial_pop
0,-1.383607,-0.267379,-1.297641,0.174756,-1.342754,1.335890,0.619928
1,1.226558,1.549396,0.660147,0.870949,-0.447742,-1.346037,-0.950457
2,0.791531,1.549396,0.333849,0.522852,1.342282,-0.452061,0.619928
3,-0.513552,0.641009,-0.318747,-1.565726,-1.342754,-0.452061,0.619928
4,-0.948579,0.186815,0.007551,-0.521437,0.447270,1.335890,-0.950457
...,...,...,...,...,...,...,...
113887,0.356503,-0.721573,-0.645045,1.219045,1.342282,0.441914,-0.950457
113888,1.226558,-0.267379,0.660147,1.219045,-1.342754,-1.346037,-0.950457
113889,0.791531,-0.267379,-0.971343,-0.869534,0.447270,0.441914,2.222362
113890,-0.078524,1.095202,1.312743,0.174756,-0.447742,-1.346037,-0.950457


In [131]:
'''scaler_y = StandardScaler()
scaler_y.fit(y_train.to_frame())
y_train = scaler_y.transform(y_train.to_frame())
y_test = scaler_y.transform(y_test.to_frame())
y_val = scaler_y.transform(y_val.to_frame())
y_train'''

'scaler_y = StandardScaler()\nscaler_y.fit(y_train.to_frame())\ny_train = scaler_y.transform(y_train.to_frame())\ny_test = scaler_y.transform(y_test.to_frame())\ny_val = scaler_y.transform(y_val.to_frame())\ny_train'

## Обучение моделей

### Линейные регрессии

In [137]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
model = LinearRegression()
model = model.fit(X_train, y_train)
print(model.score(X_train, y_train))
print(model.score(X_test, y_test))
print("MSE на тренировочной выборке:")
print(mean_squared_error(y_train, model.predict(X_train)))
print("MSE на тестовой выборке:")
print(mean_squared_error(y_test, model.predict(X_test)))


0.10558311854231828
0.09959223743318912
MSE на тренировочной выборке:
660121.421849714
MSE на тестовой выборке:
726141.5373805251


In [138]:
from sklearn import linear_model
clf = linear_model.Lasso(alpha=10)
clf.fit(X_train,y_train)
print('a:',clf.intercept_)
print( "b: ",pd.DataFrame({"name":X_train.columns.values, "coef":clf.coef_}))
print(clf.score(X_train, y_train))
print(clf.score(X_test, y_test))
print("MSE на тренировочной выборке:")
print(mean_squared_error(y_train, clf.predict(X_train)))
print("MSE на тестовой выборке:")
print(mean_squared_error(y_test, clf.predict(X_test)))

a: 166.90244628922363
b:            name        coef
0            b   94.154157
1            d -144.620626
2      death_r -191.693994
3           dd -121.919984
4         sd_b    0.000000
5         sd_d   -0.000000
6  initial_pop  -34.634953
0.10438598320413173
0.09800741360447196
MSE на тренировочной выборке:
661004.9636275731
MSE на тестовой выборке:
727419.6321052767


In [139]:
from sklearn.linear_model import Ridge
import numpy as np
clf1 = Ridge(alpha=10)
clf1.fit(X_train,y_train)
print('a:',clf1.intercept_)
print(clf1.score(X_train, y_train))
print(clf1.score(X_test, y_test))
print("MSE на тренировочной выборке:")
print(mean_squared_error(y_train, clf1.predict(X_train)))
print("MSE на тестовой выборке:")
print(mean_squared_error(y_test, clf1.predict(X_test)))

a: 166.90244628922363
0.10558311690710953
0.09959176065642394
MSE на тренировочной выборке:
660121.4230565745
MSE на тестовой выборке:
726141.9218812232


Решающие деревья