# Задание: примените методы регрессии (linreg, DTree, kNN) для данных, полученных путем парсинга сайта.

Сайт и количество признаков определяйте самостоятельно.

Сделайте соответствующие выводы.

Создайте форму и соответствующий функционал для предсказания целевой функции

In [1]:
import requests
from bs4 import BeautifulSoup
import csv
import pandas as pd
import datetime

In [2]:
def get_html(url):
    r = requests.get(url)
    return r.text

def write_csv(data):
    with open('imdb.csv', 'a', encoding = 'Utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(('Name', 'Year', 'Rating'))
        
        for i in range(len(data)):
            writer.writerow((data[i]['Name'], 
                        data[i]['Year'], 
                        data[i]['IMDB Rating']))
    
def get_page_data(html):
    soup = BeautifulSoup(html)
    table = soup.find('tbody', class_='lister-list')
    
    popular_movies=[]
    name=[]
    year=[]
    rating=[]
    
    for row in table.find_all('tr'):
        cols = row.find_all('td')
        popular_movies.append ({
            'Name': cols[1].a.text,
            'Year': pd.to_datetime(cols[1].span.text[1:5]).year,
            'IMDB Rating': cols[2].text.strip()
        } )   
    write_csv(popular_movies)

def main():
    url = 'https://www.imdb.com/chart/moviemeter'
    html = get_html(url)
    get_page_data(html)

if __name__ == '__main__':
    main()

In [3]:
imdb = pd.read_csv('imdb.csv')
imdb['Rank'] = imdb.index + 1
imdb

Unnamed: 0,Name,Year,Rating,Rank
0,Ирландец,2019,8.3,1
1,Холодное сердце 2,2019,7.3,2
2,Достать ножи,2019,8.1,3
3,Однажды в... Голливуде,2019,7.9,4
4,Ford против Ferrari,2019,8.3,5
...,...,...,...,...
95,Хан Соло: Звездные войны. Истории,2018,6.9,96
96,3022,2019,4.4,97
97,"Planes, Trains & Automobiles",1987,7.6,98
98,E.T. the Extra-Terrestrial,1982,7.8,99


In [4]:
imdb.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 4 columns):
Name      100 non-null object
Year      100 non-null int64
Rating    86 non-null float64
Rank      100 non-null int64
dtypes: float64(1), int64(2), object(1)
memory usage: 3.2+ KB


In [5]:
imdb.Rating.fillna(-1, inplace = True)

In [6]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
imdb['Name'] = le.fit_transform(imdb['Name'].astype(str))

In [7]:
y = imdb.Rating
X = imdb.drop('Rating', axis=1)

In [8]:
X.shape, y.shape

((100, 3), (100,))

In [9]:
imdb.head()

Unnamed: 0,Name,Year,Rating,Rank
0,37,2019,8.3,1
1,91,2019,7.3,2
2,29,2019,8.1,3
3,61,2019,7.9,4
4,8,2019,8.3,5


In [10]:
from sklearn.model_selection import train_test_split

In [11]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2) 

In [12]:
X_train.shape, X_valid.shape, y_train.shape, y_valid.shape, 

((80, 3), (20, 3), (80,), (20,))

In [13]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor

In [14]:
treer = DecisionTreeRegressor(max_depth=2)
linear = LinearRegression()
knr = KNeighborsRegressor(n_neighbors = 15)

In [15]:
treer.fit(X_train, y_train), linear.fit(X_train, y_train), knr.fit(X_train, y_train)

(DecisionTreeRegressor(criterion='mse', max_depth=2, max_features=None,
                       max_leaf_nodes=None, min_impurity_decrease=0.0,
                       min_impurity_split=None, min_samples_leaf=1,
                       min_samples_split=2, min_weight_fraction_leaf=0.0,
                       presort=False, random_state=None, splitter='best'),
 LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False),
 KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=15, p=2,
                     weights='uniform'))

In [16]:
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [17]:
mean_squared_error(y_valid, treer.predict(X_valid)), mean_absolute_error(y_valid, treer.predict(X_valid))

(10.818374999999996, 1.8624999999999996)

In [18]:
mean_squared_error(y_valid, linear.predict(X_valid)), mean_absolute_error(y_valid, linear.predict(X_valid))

(10.324429353005309, 2.274364957585117)

In [19]:
mean_squared_error(y_valid, knr.predict(X_valid)), mean_absolute_error(y_valid, knr.predict(X_valid))

(11.685573333333334, 2.4873333333333334)