In [1]:
import pandas
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, median_absolute_error, r2_score
import matplotlib.pyplot as plt
from scipy import stats

In [2]:
def regression_coef(model, X, y):
    coef = pandas.DataFrame(zip(['intercept'] + X.columns.tolist(), [model.intercept_] + model.coef_.tolist()),
                    columns=['predictor', 'coef'])
    X1 = np.append(np.ones((len(X),1)), X, axis=1)
    b = np.append(model.intercept_, model.coef_)
    MSE = np.sum((model.predict(X) - y) ** 2, axis=0) / float(X.shape[0] - X.shape[1])
    var_b = MSE * (np.linalg.inv(np.dot(X1.T, X1)).diagonal())
    sd_b = np.sqrt(var_b)
    t = b / sd_b
    coef['pvalue'] = [2 * (1 - stats.t.cdf(np.abs(i), (len(X1) - 1))) for i in t]
    return coef

In [3]:
data = pandas.read_csv('auto_work.csv',sep=';')
data.head()

Unnamed: 0,name,price,yearOfRegistration,powerPS,kilometer,monthOfRegistration
0,BMW_435i_Sport_coupe,39600,2014,306,30000,7
1,BMW_318d_Aut.__Xenon__Navi__Sportsitze_FESTREIS!,23490,2013,143,40000,6
2,Hyundai_Genesis_Coupe_GT_3.8_V6_Automatik,22999,2012,303,50000,4
3,Mercedes_Benz_E_250_CDI_Mod.2011_Automatik_NAV...,20300,2010,204,80000,12
4,A5_Sportback_2.7_Tdi,18300,2011,190,125000,5


In [4]:
X = data.drop(['price','name'], axis=1)
y = data['price']

In [5]:
X.corr()

Unnamed: 0,yearOfRegistration,powerPS,kilometer,monthOfRegistration
yearOfRegistration,1.0,0.169334,0.018552,0.114724
powerPS,0.169334,1.0,-0.092028,0.291719
kilometer,0.018552,-0.092028,1.0,0.123442
monthOfRegistration,0.114724,0.291719,0.123442,1.0


In [6]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0)

In [7]:
model = LinearRegression()
model.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [8]:
model.score(X_train, y_train)

0.583796783259135

In [9]:
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

In [10]:
#pandas.DataFrame({'variable':['intercept']+X.columns.tolist(),'coef':[model.intercept_]+model.coef_.tolist()})
regression_coef(model, X, y)

TypeError: data argument can't be an iterator

In [12]:
help(pandas)

Help on package pandas:

NAME
    pandas

DESCRIPTION
    pandas - a powerful data analysis and manipulation library for Python
    
    **pandas** is a Python package providing fast, flexible, and expressive data
    structures designed to make working with "relational" or "labeled" data both
    easy and intuitive. It aims to be the fundamental high-level building block for
    doing practical, **real world** data analysis in Python. Additionally, it has
    the broader goal of becoming **the most powerful and flexible open source data
    analysis / manipulation tool available in any language**. It is already well on
    its way toward this goal.
    
    Main Features
    -------------
    Here are just a few of the things that pandas does well:
    
      - Easy handling of missing data in floating point as well as non-floating
        point data
      - Size mutability: columns can be inserted and deleted from DataFrame and
        higher dimensional objects
      - Automatic and

In [None]:
plt.xlabel('Предсказание')
plt.ylabel('Остатки')
plt.scatter(y_train_pred,  y_train_pred - y_train,
            c='blue', marker='o', label='Обучение')
plt.scatter(y_test_pred,  y_test_pred - y_test,
            c='lightgreen', marker='s', label='Тест')
plt.legend(loc='upper right')
plt.xlim([0, 50000])

In [None]:
print('MSE train: {:.3f}, test: {:.3f}'.format(
        mean_squared_error(y_train, y_train_pred),
        mean_squared_error(y_test, y_test_pred)))
print('MAE train: {:.3f}, test: {:.3f}'.format(
        mean_absolute_error(y_train, y_train_pred),
        mean_absolute_error(y_test, y_test_pred)))