In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import mean_squared_error
from sklearn import linear_model
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split

Задача: предполагается что есть зависимость отношения числа показов в блоках Premium (над результатами поиска) и Other (под результатами поиска) в зависимости от нашей ставки Bid и цен конкурентов p11,p12,p13,p21,p22,p23,p24, но это не точно. Нужно эту зависимость найти, либо обосновать ее отсутствие.

In [2]:
data=pd.read_csv('ShowsAndPrices.csv', delimiter=';' ,error_bad_lines=False)

In [3]:
# Переведем все значения в числовой формат
for i in data.columns:
    try:
        data[i]=data[i].astype(str)
        data[i] = [x.replace(',', '.') for x in data[i]]
        data[i]=data[i].astype(float)
    except:
        next
# В Bid tcmn незначительное количество пропусков. Просто уберем их
data=data[data['Bid'] >0 ]

In [4]:
data['agg']=data['p11']*data['p12']*data['p13']*data['p21']*data['p22']*data['p23']*data['p24']
data['mean']=(data['p11']+data['p12']+data['p13']+data['p21']+data['p22']+data['p23']+data['p24'])/7
y = data['Premium'] / (data['Premium'] + data['Other'])
nd=np.log(data)
y=np.log(y)
del nd['Premium']
del nd['Other']

In [5]:
X_train, X_test, y_train, y_test = train_test_split(nd, y, test_size=0.30, random_state=42)

In [6]:
# обучимся
model = linear_model.LinearRegression()
model.fit(X_train, y_train)
# Предскажем
y_pred=model.predict(X_test)
print('cross_val_score', cross_val_score(model, X_train, y_train))
print('mean_squared_error' , mean_squared_error(y_test, y_pred))
# Посмотрим на коэфициенты регрессии
coefs=pd.DataFrame(columns=['name','coef'])
coefs['coef'], coefs['name'] = list(model.coef_), X_train.columns
coefs.sort_values(by='coef' , ascending=False )

cross_val_score [0.1003014  0.09830538 0.09385849]
mean_squared_error 0.27046857543329234


Unnamed: 0,name,coef
1,p11,51296850000.0
2,p12,51296850000.0
7,p24,51296850000.0
3,p13,51296850000.0
5,p22,51296850000.0
6,p23,51296850000.0
4,p21,51296850000.0
0,Bid,0.1860096
9,mean,-0.1726561
8,agg,-51296850000.0


### Видна зависимость фичей от таргета, но уравнение регрессии получается некрасивое, поэтому предложим модели самой отобрать важные фичи

In [7]:
clf = Lasso(alpha=0.001)
clf.fit(X_train, y_train)
y_pred=clf.predict(X_test)
print('cross_val_score',cross_val_score(clf, X_train, y_train))
print('mean_squared_error' ,mean_squared_error(y_test, y_pred))

coefs=pd.DataFrame(columns=['name','coef'])
coefs['coef'], coefs['name'] = list(clf.coef_), X_train.columns
coefs.sort_values(by='coef' , ascending=False )

cross_val_score [0.09988439 0.09762704 0.09329511]
mean_squared_error 0.270576482746287


Unnamed: 0,name,coef
0,Bid,0.182538
2,p12,0.030147
7,p24,0.024187
1,p11,0.01693
8,agg,0.00873
3,p13,0.0
6,p23,-0.0
5,p22,-0.012856
9,mean,-0.08975
4,p21,-0.175884


### Уберем фичи, которые модель посчитала лишними.

In [8]:
del nd['p11']
del nd['p24']

#### Повторим

In [9]:
X_train, X_test, y_train, y_test = train_test_split(nd, y, test_size=0.30, random_state=42)

In [10]:
model = linear_model.LinearRegression()
model.fit(X_train, y_train)

y_pred=model.predict(X_test)
print('cross_val_score', cross_val_score(model, X_train, y_train))
print('mean_squared_error' , mean_squared_error(y_test, y_pred))

coefs=pd.DataFrame(columns=['name','coef'])
coefs['coef'], coefs['name'] = list(model.coef_), X_train.columns
coefs.sort_values(by='coef' , ascending=False )

cross_val_score [0.10024153 0.09828324 0.0938094 ]
mean_squared_error 0.27048641172670185


Unnamed: 0,name,coef
0,Bid,0.185608
6,agg,0.049778
1,p12,0.000678
2,p13,-0.010624
4,p22,-0.045839
5,p23,-0.0563
7,mean,-0.150342
3,p21,-0.236044


### Проверим все ли фичи остались важны

In [11]:
clf = Lasso(alpha=0.001)
clf.fit(X_train, y_train)
y_pred=clf.predict(X_test)
print('cross_val_score',cross_val_score(clf, X_train, y_train))
print('mean_squared_error' ,mean_squared_error(y_test, y_pred))

coefs=pd.DataFrame(columns=['name','coef'])
coefs['coef'], coefs['name'] = list(clf.coef_), X_train.columns
coefs.sort_values(by='coef' , ascending=False )

cross_val_score [0.09990344 0.09743729 0.09314349]
mean_squared_error 0.27057971094631544


Unnamed: 0,name,coef
0,Bid,0.182383
6,agg,0.022872
1,p12,0.010402
2,p13,0.0
5,p23,-0.001253
4,p22,-0.030637
7,mean,-0.084777
3,p21,-0.20004


### Удалим лишнюю фичу и повторим

In [12]:
del nd['p13']
X_train, X_test, y_train, y_test = train_test_split(nd, y, test_size=0.30, random_state=42)

In [13]:
model = linear_model.LinearRegression()
model.fit(X_train, y_train)

y_pred=model.predict(X_test)
print('cross_val_score', cross_val_score(model, X_train, y_train))
print('mean_squared_error' , mean_squared_error(y_test, y_pred))

coefs=pd.DataFrame(columns=['name','coef'])
coefs['coef'], coefs['name'] = list(model.coef_), X_train.columns
coefs.sort_values(by='coef' , ascending=False )

cross_val_score [0.10025664 0.09826356 0.09380412]
mean_squared_error 0.2704805438987709


Unnamed: 0,name,coef
0,Bid,0.185589
5,agg,0.048489
1,p12,-0.001499
3,p22,-0.044938
4,p23,-0.053692
6,mean,-0.1482
2,p21,-0.241223


In [14]:
clf = Lasso(alpha=0.001)
clf.fit(X_train, y_train)
y_pred=clf.predict(X_test)
print('cross_val_score',cross_val_score(clf, X_train, y_train))
print('mean_squared_error' ,mean_squared_error(y_test, y_pred))

coefs=pd.DataFrame(columns=['name','coef'])
coefs['coef'], coefs['name'] = list(clf.coef_), X_train.columns
coefs.sort_values(by='coef' , ascending=False )
print('Intercept', model.intercept_)

cross_val_score [0.09990299 0.09743729 0.09314349]
mean_squared_error 0.27057971487245314
Intercept -0.3983220754686642


Таким образом, все неважные фичи удалены. При дальнейших корректировках возрастает среднеквадратичная ошибка.
Будем считать, что найдено оптимальное уравнение регресии.
Наиболее влияющие коэфициенты: ставка Bid (0.182), p21 (-0.199) и mean (-0.084)
Финальная модель будет выглядеть следующим образом 


In [None]:
target = -0.398 + 0.182 * data['bid'] - 0.199*data['p21'] - 0.084*data['mean'] + \
0.0223*data['agg'] + 0.011*p12 - 0.030*data['p22']

Для упрощения модели и небольшой потере в качесве (0.06%) но ускорении времени обучения в дальнейшем уберем все кореллирующие признаки и (или признаки с ннзкими коэфициентами). Построим регрессию

In [15]:
del nd['p12']
del nd['agg']
del nd['p22']
del nd['p23']
del nd['mean']
X_train, X_test, y_train, y_test = train_test_split(nd, y, test_size=0.30, random_state=42)

In [16]:
model = linear_model.LinearRegression()
model.fit(X_train, y_train)

y_pred=model.predict(X_test)
print('cross_val_score', cross_val_score(model, X_train, y_train))
print('mean_squared_error' , mean_squared_error(y_test, y_pred))

coefs=pd.DataFrame(columns=['name','coef'])
coefs['coef'], coefs['name'] = list(model.coef_), X_train.columns
coefs.sort_values(by='coef' , ascending=False )
print('Intercept', model.intercept_)

cross_val_score [0.09441036 0.09231992 0.08780369]
mean_squared_error 0.2722267462262108
Intercept -0.41338064438160355


In [None]:
Тогда уравнение будет вида
target = -0.41 + 0.178*data['Bid'] -0.141*data['p21']