<a href="https://colab.research.google.com/github/sokanaid/CourseProject2022BiologicalMath/blob/main/model_learning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Обучение моделей предсказания числинности на плато

In [2]:
import pandas as pd
import os
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
np.random.seed(0)

In [3]:
from google.colab import drive
drive.mount('/content/drive/')
%cd /content/drive/MyDrive/sim_tables

Mounted at /content/drive/
/content/drive/MyDrive/sim_tables


## Подготовка датасета

In [67]:
file_name = "/content/drive/MyDrive/sim_tables/data_set.csv"
sim_frame = pd.read_csv(file_name, sep='\t').drop("Unnamed: 0", axis = 1)

In [68]:
sim_frame.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 290525 entries, 0 to 290524
Data columns (total 9 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   b              290525 non-null  float64
 1   d              290525 non-null  float64
 2   death_r        290525 non-null  float64
 3   dd             290525 non-null  float64
 4   sd_b           290525 non-null  float64
 5   sd_d           290525 non-null  float64
 6   area_length_x  290525 non-null  float64
 7   initial_pop    290525 non-null  float64
 8   plateau_pop    290525 non-null  float64
dtypes: float64(9)
memory usage: 19.9 MB


In [69]:
print("total count of simulations:", len(sim_frame))
df = sim_frame[sim_frame["area_length_x"]==100.0].drop("area_length_x", axis = 1)
df1 = df.drop_duplicates(subset=['b', 'd', 'death_r', 'dd', 'sd_b', 'sd_d', 'initial_pop']).reset_index(drop=True)
print("total count of simulations to use in learning:", len(df))

total count of simulations: 290525
total count of simulations to use in learning: 221711


In [70]:
# рапределение размеров в изначальном датасете
sim_frame["area_length_x"].value_counts()

  100.00    221711
1,100.00      3127
3,100.00      2345
2,100.00      2345
4,100.00      2345
2,500.00      2343
3,300.00      2343
3,500.00      2343
2,300.00      2343
  300.00      2343
1,700.00      2343
3,900.00      2343
2,900.00      2343
4,300.00      2343
4,500.00      2343
  500.00      2343
1,900.00      2343
3,700.00      2343
5,100.00      2343
2,700.00      2343
  700.00      2343
  900.00      2343
4,700.00      2343
1,300.00      2343
1,500.00      2343
5,700.00      2342
5,500.00      2342
4,900.00      2342
5,900.00      2342
5,300.00      2342
2,600.00        14
1,600.00        14
3,600.00        14
  600.00        14
4,600.00        13
5,600.00        13
Name: area_length_x, dtype: int64

In [71]:
pd.options.display.float_format = '{:20,.2f}'.format

In [72]:
# распределение параметров
for column in list(df.columns.values):
  print(column)
  print(df[column].describe())

b
count             221,711.00
mean                    0.67
std                     0.23
min                     0.10
25%                     0.55
50%                     0.75
75%                     0.85
max                     0.95
Name: b, dtype: float64
d
count             221,711.00
mean                    0.36
std                     0.22
min                     0.10
25%                     0.20
50%                     0.30
75%                     0.50
max                     0.90
Name: d, dtype: float64
death_r
count             221,711.00
mean                    5.36
std                     2.96
min                     0.10
25%                     3.00
50%                     5.00
75%                     8.00
max                    10.00
Name: death_r, dtype: float64
dd
count             221,711.00
mean                    0.55
std                     0.29
min                     0.10
25%                     0.30
50%                     0.55
75%                     0.80
max     

In [73]:
df["plateau_pop"].describe()

count             221,711.00
mean                  134.51
std                   757.02
min                     0.00
25%                     0.00
50%                     0.00
75%                     0.00
max                34,227.56
Name: plateau_pop, dtype: float64

In [74]:
X = df.drop("plateau_pop", axis =1)
y = df["plateau_pop"]

In [75]:
X.head()

Unnamed: 0,b,d,death_r,dd,sd_b,sd_d,initial_pop
0,0.35,0.1,3.0,0.9,0.5,0.65,100.0
1,0.45,0.3,7.0,0.7,0.95,0.8,1.0
2,0.95,0.8,9.0,0.2,0.8,0.95,1.0
3,0.35,0.2,1.0,0.6,0.8,0.65,1.0
4,0.35,0.2,6.0,0.1,0.65,0.65,1.0


In [76]:
y.head()

0                   0.00
1                   0.00
2                   0.00
3                   0.00
4                   0.00
Name: plateau_pop, dtype: float64

In [77]:
X_train, X_t, y_train, y_t = train_test_split(X, y, test_size=0.3, random_state = 3)
X_train

Unnamed: 0,b,d,death_r,dd,sd_b,sd_d,initial_pop
91131,0.85,0.80,6.00,0.90,0.80,0.80,100.00
136496,0.35,0.20,3.00,0.40,0.80,0.50,50.00
55046,0.95,0.20,6.00,0.10,0.50,0.95,50.00
9357,0.35,0.20,9.00,0.20,0.80,0.95,50.00
202309,0.75,0.30,9.00,0.90,0.95,0.80,50.00
...,...,...,...,...,...,...,...
62976,0.75,0.60,1.00,0.30,0.80,0.95,1.00
249107,0.95,0.10,6.00,0.10,0.50,0.95,1.00
101002,0.35,0.20,2.00,0.70,0.95,0.80,100.00
259791,0.75,0.20,6.00,0.30,0.50,0.50,50.00


In [78]:
X_test,X_val, y_test, y_val = train_test_split(X_t, y_t, test_size=0.4, random_state = 3)
X_val

Unnamed: 0,b,d,death_r,dd,sd_b,sd_d,initial_pop
108079,0.95,0.50,9.00,0.60,0.95,0.80,100.00
88856,0.75,0.40,1.00,0.50,0.65,0.95,100.00
184608,0.75,0.70,8.00,0.70,0.50,0.95,1.00
142507,0.35,0.10,0.10,0.90,0.50,0.95,1.00
56300,0.75,0.40,7.00,0.10,0.50,0.95,1.00
...,...,...,...,...,...,...,...
237398,0.55,0.30,10.00,0.20,0.50,0.80,100.00
192025,0.25,0.10,1.00,0.80,0.95,0.95,100.00
159130,0.85,0.20,5.00,0.30,0.80,0.95,50.00
225534,0.35,0.20,7.00,0.60,0.95,0.50,100.00


In [79]:
print("train len", len(y_train))
print("test len ", len(y_t))
print("val len ", len(y_val))


train len 155197
test len  66514
val len  26606


In [80]:
# нормализация данных
scaler = StandardScaler()
scaler.fit(X_train)
X_train = pd.DataFrame(scaler.transform(X_train), columns = X_train.columns)
X_test = pd.DataFrame(scaler.transform(X_test), columns = X_test.columns)
X_val = pd.DataFrame(scaler.transform(X_val), columns = X_val.columns)
X_train

Unnamed: 0,b,d,death_r,dd,sd_b,sd_d,initial_pop
0,0.77,1.98,0.22,1.22,0.45,0.45,1.25
1,-1.42,-0.73,-0.80,-0.52,0.45,-1.34,0.02
2,1.21,-0.73,0.22,-1.56,-1.34,1.34,0.02
3,-1.42,-0.73,1.23,-1.22,0.45,1.34,0.02
4,0.33,-0.28,1.23,1.22,1.34,0.45,0.02
...,...,...,...,...,...,...,...
155192,0.33,1.07,-1.47,-0.87,0.45,1.34,-1.18
155193,1.21,-1.19,0.22,-1.56,-1.34,1.34,-1.18
155194,-1.42,-0.73,-1.14,0.52,1.34,0.45,1.25
155195,0.33,-0.73,0.22,-0.87,-1.34,-1.34,0.02


In [81]:
scaler_y = StandardScaler()
scaler_y.fit(y_train.to_frame())
y_train = scaler_y.transform(y_train.to_frame())
y_test = scaler_y.transform(y_test.to_frame())
y_val = scaler_y.transform(y_val.to_frame())
y_train

array([[-0.17736242],
       [-0.17736242],
       [ 0.64752227],
       ...,
       [-0.17736242],
       [-0.17736242],
       [-0.17736242]])

## Обучение моделей

### Линейные регрессии

In [82]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
model = LinearRegression()
model = model.fit(X_train, y_train)
print(model.score(X_train, y_train))
print(model.score(X_test, y_test))
print("MSE на тренировочной выборке:")
print(mean_squared_error(scaler_y.inverse_transform(y_train), scaler_y.inverse_transform(model.predict(X_train))))
print("MSE на тестовой выборке:")
print(mean_squared_error(scaler_y.inverse_transform(y_test), scaler_y.inverse_transform(model.predict(X_test))))
model.coef_

0.09731184782514135
0.09932698616161295
MSE на тренировочной выборке:
512047.05769567395
MSE на тестовой выборке:
520263.5109154538


array([[ 0.12849659, -0.18237533, -0.21857941, -0.14586917,  0.00920277,
        -0.00915099, -0.05258099]])

In [83]:
from sklearn import linear_model
clf = linear_model.Lasso(alpha=10)
clf.fit(X_train,y_train)
print('a:',clf.intercept_)
print( "b: ",pd.DataFrame({"name":X_train.columns.values, "coef":clf.coef_}))
print(clf.score(X_train, y_train))
print(clf.score(X_test, y_test))


a: [1.08048535e-17]
b:            name                 coef
0            b                 0.00
1            d                -0.00
2      death_r                -0.00
3           dd                -0.00
4         sd_b                 0.00
5         sd_d                -0.00
6  initial_pop                -0.00
0.0
-3.423290443627103e-05


In [85]:
from sklearn.linear_model import Ridge
import numpy as np
clf1 = Ridge(alpha=10)
clf1.fit(X_train,y_train)
print('a:',clf1.intercept_)
print(clf1.score(X_train, y_train))
print(clf1.score(X_test, y_test))
print("MSE на тренировочной выборке:")
print(mean_squared_error(scaler_y.inverse_transform(y_train), scaler_y.inverse_transform(clf1.predict(X_train))))
print("MSE на тестовой выборке:")
print(mean_squared_error(scaler_y.inverse_transform(y_test), scaler_y.inverse_transform(clf1.predict(X_test))))
clf1.coef_

a: [-3.40308811e-17]
0.09731184709113061
0.09932680093682311
MSE на тренировочной выборке:
512047.0581120392
MSE на тестовой выборке:
520263.61790844455


array([[ 0.1284759 , -0.18235349, -0.21856473, -0.14585975,  0.00920224,
        -0.00915045, -0.05257793]])

### Решающие деревья

In [86]:
X_test,X_val, y_test, y_val = train_test_split(X_t, y_t, test_size=0.4, random_state = 3)
X_test,X_val, y_test, y_val = train_test_split(X_t, y_t, test_size=0.4, random_state = 3)

In [93]:
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor(n_estimators = 300, max_features = 'sqrt', max_depth = 10, min_samples_leaf=10, random_state = 18)
rf.fit(X_train, y_train)
print(rf.score(X_train, y_train))
print(rf.score(X_test, y_test))
print("MSE на тренировочной выборке:")
print(mean_squared_error(y_train, rf.predict(X_train)))
print("MSE на тестовой выборке:")
print(mean_squared_error(y_test, rf.predict(X_test)))

  rf.fit(X_train, y_train)


0.8572756724092407
-0.03306120576049576
MSE на тренировочной выборке:
0.14272432759075918
MSE на тестовой выборке:
596736.042538905


In [100]:
from sklearn.ensemble import BaggingRegressor
br = BaggingRegressor(n_estimators = 50, random_state = 18)
br.fit(X_train, y_train)
print(br.score(X_train, y_train))
print(br.score(X_test, y_test))
print("MSE на тренировочной выборке:")
print(mean_squared_error(y_train, br.predict(X_train)))
print("MSE на тестовой выборке:")
print(mean_squared_error(y_test, br.predict(X_test)))

  return column_or_1d(y, warn=True)


0.9896448666958134
-0.033067260497136486
MSE на тренировочной выборке:
0.010355133304186536
MSE на тестовой выборке:
596739.5399885832


### обучение нейронной модели