# Syed Razvi Project 2

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from pandas import DataFrame
import time
import random
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import statsmodels.api as sm
from sklearn.linear_model import LinearRegression, Ridge
import pickle
import datetime as dt
import warnings
warnings.filterwarnings("ignore")
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.linear_model import RidgeCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.model_selection import KFold

%matplotlib inline

# Obtaining Data from myanimelist.net

In [None]:
# Build the "<year>/<season>" path fragments for every season page to visit,
# from the start year x through the end of 2020 (four seasons per year).
x = 2010
seasons = ['winter','spring','summer','fall']
season_list = [
    f'{year}/{season}'
    for year in range(x, 2021)
    for season in seasons
]
print(season_list)

In [None]:
# Pull individual show links from season pages.
# For every season page, collect the link of each listing whose info text
# starts with 'TV'; the result accumulates in show_links (duplicates possible).
url = "https://myanimelist.net/anime/season/{}"
show_links = []
for i in season_list:
    url2 = url.format(i)
    # A timeout keeps one stalled connection from hanging the whole scrape
    season_page = requests.get(url2, timeout=30)
    # Name the parser explicitly so the parse does not depend on which
    # optional parser libraries happen to be installed
    season_soup = BeautifulSoup(season_page.text, 'html.parser')
    for link in season_soup.find_all('div',class_='seasonal-anime js-seasonal-anime'):
        info = link.find('div', class_='info')
        title_link = link.find('a', class_='link-title')
        # Skip listings missing either element instead of failing with
        # AttributeError on None
        if info is None or title_link is None:
            continue
        if info.text.strip('\n ')[:2] == 'TV':
            show_links.append(title_link.get('href'))
    # Randomised pause between requests
    time.sleep(.5+2*random.random())
show_links

In [None]:
# Drop repeated show links; dict keys keep first-seen order, so the
# original ordering of show_links is preserved.
show_links2 = list({show_link: None for show_link in show_links})

## Scraping Data from Individual Show Pages

In [None]:
# Scraping Individual Show Data
# For every show link, fetch its '/stats' page and build one row:
# [title, media, episodes, status, aired, broadcast, licensors, source,
#  genres, duration, rating, watched, completed, dropped, total, score].

# Sidebar labels whose value is simply the text that follows the label,
# mapped to the name of the row field each one fills.
text_labels = {
    'Type:': 'media',
    'Episodes:': 'episodes',
    'Status:': 'status',
    'Aired:': 'aired',
    'Broadcast:': 'broad',
    'Licensors:': 'licensors',
    'Source:': 'source',
    'Duration:': 'duration',
    'Rating:': 'rating',
}
# Labels of the viewer counts read from the 'spaceit_pad' divs.
count_labels = {
    'Watching:': 'watched',
    'Completed:': 'completed',
    'Dropped:': 'dropped',
    'Total:': 'total',
}

Df_Row = []
for show in show_links2:
    # A timeout keeps one stalled connection from hanging the whole scrape
    show_page = requests.get(show+'/stats', timeout=30)
    # Name the parser explicitly so the parse does not depend on which
    # optional parser libraries happen to be installed
    show_soup = BeautifulSoup(show_page.text, 'html.parser')
    sidebar = show_soup.find('div', style='width: 225px')
    if sidebar is None or show_soup.title is None:
        # The expected layout is missing (error page, changed markup, ...):
        # report and move on instead of failing with AttributeError on None
        print(f'Skipping {show}: expected page layout not found')
        time.sleep(.5+2*random.random())
        continue
    title = show_soup.title.text.strip('\n').replace(' - Statistics - MyAnimeList.net','')

    # Reset every field for each show. Without this, a label missing from
    # one page would silently reuse the value scraped for the previous show
    # (or raise NameError on the very first show).
    fields = dict.fromkeys(text_labels.values())
    counts = dict.fromkeys(count_labels.values())
    Genres = []
    score = float(0)  # 0 marks "no score"; such rows are filtered out later

    for box in sidebar.find_all('div'):
        box_text = box.text
        for label, field in text_labels.items():
            if label in box_text:
                fields[field] = box_text.replace(label,'').strip('\n ')
        if 'Genres:' in box_text:
            for genre in box.find_all('span', itemprop='genre'):
                Genres.append(genre.text)
        if 'Score:' in box_text:
            try:
                score = float(box.find('span', itemprop='ratingValue').text)
            except (AttributeError, ValueError):
                # No ratingValue span, or its text is not a number
                score = float(0)

    for i in show_soup.find_all('div', class_='spaceit_pad'):
        for label, field in count_labels.items():
            if label in i.text:
                counts[field] = int(i.text.replace(label+' ','').replace(',',''))

    Row_List = [
        title, fields['media'], fields['episodes'], fields['status'],
        fields['aired'], fields['broad'], fields['licensors'],
        fields['source'], Genres, fields['duration'], fields['rating'],
        counts['watched'], counts['completed'], counts['dropped'],
        counts['total'], score,
    ]
    Df_Row.append(Row_List)
    # Randomised pause between requests
    time.sleep(.5+2*random.random())
Df_Row

In [None]:
# Turn the list of scraped rows into a dataframe; the column names follow
# the order in which each row's values were assembled.
scraped_columns = [
    'Title', 'Media_Type', 'Episodes', 'Status', 'Aired', 'Broadcast',
    'Licensed', 'Source', 'Genres', 'Runtime', 'Age_Rating', 'Watched',
    'Completed', 'Dropped', 'Total', 'Score',
]
Full_DF = pd.DataFrame(Df_Row, columns=scraped_columns)

# Cleaning up and Formatting Data

In [None]:
# Keep only shows whose status is 'Finished Airing' (drops currently airing
# and not-yet-aired shows), store the episode count as an integer and
# renumber the rows from 0.
finished_mask = Full_DF['Status'] == 'Finished Airing'
Data_Set = (
    Full_DF[finished_mask]
    .astype({'Episodes': int})
    .reset_index(drop=True)
)

In [None]:
# Replace the Age_Rating column with one indicator column per rating value
age_rating_dummies = pd.get_dummies(Data_Set['Age_Rating'])
Data_Set = pd.concat(
    [Data_Set.drop(columns='Age_Rating'), age_rating_dummies],
    axis=1,
)
Data_Set

In [None]:
# Fixing Runtimes to be Numerical (whole minutes)
# The previous version replaced only the exact value '40/60' with '0' and
# then called astype(int), which raises ValueError for any value that is
# not already a digit string.
# NOTE(review): assumes the scraped duration text looks like
# '24 min. per ep.' or '1 hr. 30 min.' - confirm against the raw data.
runtime_text = Data_Set['Runtime'].astype(str)
# Values that are already plain numbers are kept as they are
runtime_plain = pd.to_numeric(runtime_text, errors='coerce')
# Otherwise pick out the hour and minute parts; a missing part counts as 0
runtime_hours = pd.to_numeric(
    runtime_text.str.extract(r'(\d+)\s*hr', expand=False), errors='coerce'
).fillna(0)
runtime_mins = pd.to_numeric(
    runtime_text.str.extract(r'(\d+)\s*min', expand=False), errors='coerce'
).fillna(0)
# Text with neither a number nor an hr/min part (including the '40/60'
# value that used to be mapped to '0') ends up as 0
Data_Set['Runtime'] = runtime_plain.fillna(runtime_hours*60 + runtime_mins).astype(int)
Data_Set['Runtime'].value_counts()

In [None]:
# Adaptation Source Dummy
# Fold the 'Radio' source into 'Other', then swap the Source column for one
# indicator column per remaining source value.
Data_Set['Source'] = Data_Set['Source'].replace({'Radio': 'Other'})
Data_Set['Source'].value_counts()
source_dummies = pd.get_dummies(Data_Set['Source'])
Data_Set = pd.concat(
    [Data_Set.drop(columns='Source'), source_dummies],
    axis=1,
)

In [None]:
# Licensed or Not Dummy
# 0 when the Licensed text is the 'None found, add some' placeholder,
# 1 for anything else.
# NOTE(review): in the visible cells this list is only displayed and never
# added to Data_Set - confirm whether it was meant to become a column.
Data_Set['Licensed'].value_counts()
licensed = [
    int(licensor_text != 'None found, add some')
    for licensor_text in Data_Set['Licensed']
]
licensed

In [None]:
# Adding Watch Stats
# Share of all listed users who are watching or have completed the show,
# and share who dropped it.
total_users = Data_Set['Total']
Data_Set['Perc_Watched'] = Data_Set['Watched'].add(Data_Set['Completed']).div(total_users)
Data_Set['Perc_Dropped'] = Data_Set['Dropped'].div(total_users)

In [None]:
# Removing Unused Variables
# Drop the descriptive columns and the raw counts that are not used as
# model features.
unused_columns = [
    'Title', 'Media_Type', 'Status', 'Aired', 'Broadcast', 'Licensed',
    'Genres', 'Watched', 'Completed', 'Dropped',
]
Data_Set2 = Data_Set.drop(columns=unused_columns)
# 'Action' is dropped only when present: none of the visible cells creates
# an 'Action' column, so requiring it made this cell fail with KeyError.
Data_Set2 = Data_Set2.drop(columns='Action', errors='ignore')

In [None]:
# Keep only rows with a non-zero Score; a Score of 0 marks shows that are
# unscored or have too few votes to have been given a score.
has_score = Data_Set2['Score'].ne(0)
Data_Set2 = Data_Set2.loc[has_score]

In [None]:
# Save the cleaned dataframe to disk
with open('cleanscrape2.pickle', mode='wb') as pickle_out:
    pickle.dump(Data_Set2, pickle_out)

In [None]:
# Load the saved dataframe back from disk and display it
with open('cleanscrape2.pickle', mode='rb') as pickle_in:
    new_df = pickle.load(pickle_in)
new_df

# Building Model

In [None]:
# Splitting Dataset
# Features are every column except the target 'Score'.
# The columns= keyword replaces drop('Score', 1): passing the axis
# positionally is not accepted by recent pandas releases.
X = Data_Set2.drop(columns='Score')
y = Data_Set2['Score']
#Hold 20% of the data out for testing
X, X_test, y, y_test = train_test_split(X, y, test_size=0.2)

In [None]:
# Carve a validation set out of the non-test data: a quarter of what is
# left after the test split goes to validation, the rest to training.
val_fraction = .25
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=val_fraction)

In [None]:
# The three candidate models, compared on the validation split (re-run with
# different random splits before choosing one):
#   lm      - plain linear regression on the raw features
#   lm_reg  - ridge regression on standardised features
#   lm_poly - linear regression on degree-2 polynomial features
lm = LinearRegression()
lm_reg = Ridge(alpha=1000)
lm_poly = LinearRegression()

# Standardise features for the ridge model; the scaler is fitted on the
# training split only and then applied to train, val and test.
scaler = StandardScaler()
scaler.fit(X_train.values)
X_train_scaled, X_val_scaled, X_test_scaled = (
    scaler.transform(split.values) for split in (X_train, X_val, X_test)
)

# Degree-2 polynomial expansion for the polynomial model, likewise fitted
# on the training split only.
poly = PolynomialFeatures(degree=2)
poly.fit(X_train.values)
X_train_poly, X_val_poly, X_test_poly = (
    poly.transform(split.values) for split in (X_train, X_val, X_test)
)

In [None]:
# Fit each candidate on the training split and report its R^2 on the
# validation split.
lm.fit(X_train, y_train)
linear_val_r2 = lm.score(X_val, y_val)

lm_reg.fit(X_train_scaled, y_train)
ridge_val_r2 = lm_reg.score(X_val_scaled, y_val)

lm_poly.fit(X_train_poly, y_train)
poly_val_r2 = lm_poly.score(X_val_poly, y_val)

print(f'Linear Regression val R^2: {linear_val_r2:.3f}')
print(f'Ridge Regression val R^2: {ridge_val_r2:.3f}')
print(f'Degree 2 Polynomial Regression val R^2: {poly_val_r2:.3f}')

In [None]:
# Refit the polynomial model on the training split and show the fitted
# coefficient of every polynomial feature.
lm_poly.fit(X_train_poly, y_train)
lm_poly.coef_

In [None]:
# Conduct resulting test score
# Refit the plain linear model on the combined train+validation data and
# score that same model on the held-out test set. Previously this cell
# refitted lm but printed the score of lm_poly (fitted on X_train only)
# under a polynomial label.
# NOTE(review): assumes the linear model is the one being reported, since
# the residual plots below also predict with lm - confirm.
lm.fit(X,y)
print(f'Linear Regression test R^2: {lm.score(X_test, y_test):.3f}')

# Visualizations of Residual Plots

In [None]:
# Compare plots between validation and test
# Residual plot of the actual validation scores against lm's predictions,
# saved to Val_Residuals.png.
y_val_pred = lm.predict(X_val)
# x and y are passed by keyword: current seaborn releases do not accept
# them positionally.
sns.residplot(x=y_val_pred, y=y_val)
# Titles, labels and tick styling are applied after drawing so that any
# axis label seaborn sets itself (e.g. from the name of y_val) cannot
# replace them.
plt.suptitle('Val Split Residuals',fontsize = 20, weight = 'bold',color='black', y=1.05)
plt.title('Decent Prediction', fontsize=15)
plt.xlabel('Score',fontsize = 20, weight = 'bold',color='black')
plt.ylabel('Residuals',fontsize = 20, weight = 'bold',color='black')
plt.xticks(fontsize = 10, weight = 'bold',color='blue')
plt.yticks(fontsize = 10, weight = 'bold',color='blue')
plt.savefig("Val_Residuals.png",dpi=300, bbox_inches='tight')

In [None]:
# Residual plot of the actual test scores against lm's predictions,
# saved to Test_Residuals.png.
y_test_pred = lm.predict(X_test)
# x and y are passed by keyword: current seaborn releases do not accept
# them positionally.
sns.residplot(x=y_test_pred, y=y_test)
# Titles, labels and tick styling are applied after drawing so that any
# axis label seaborn sets itself (e.g. from the name of y_test) cannot
# replace them.
plt.suptitle('Test Split Residuals',fontsize = 20, weight = 'bold',color='black', y=1.05)
plt.title('Similar Prediction', fontsize=15)
plt.xlabel('Score',fontsize = 20, weight = 'bold',color='black')
plt.ylabel('Residuals',fontsize = 20, weight = 'bold',color='black')
plt.xticks(fontsize = 10, weight = 'bold',color='blue')
plt.yticks(fontsize = 10, weight = 'bold',color='blue')
plt.savefig("Test_Residuals.png",dpi=300, bbox_inches='tight')