# Data Cleaning and Machine Learning 

## Reading the New Integrated Sample

In [1]:
import numpy as np
import imdb 
import pandas as pd
anime_data = pd.read_csv("C://Users//Master//new_anime_data1.csv")

anime_data['episodes'] = anime_data['episodes'].replace('Unknown', np.nan)


## Data Transformation

### Drop Some Special Characters from "genre" and "overview", and Covert "type" to Movie/Tv series

In [2]:
l1 = []
l2 = []
l3 = []


anime_data['genre'].fillna('',  inplace=True)
anime_data['overview'].fillna('',  inplace=True)
anime_data['type'].fillna('',  inplace=True)
for index, row in anime_data.iterrows():
    item = row['genre']
    if(pd.isnull(item)):
            item =""
    else:
        if isinstance(item, (list, tuple)):
            item = ','.join(item)
        else:
            item = item.replace(" ","")
            item = item.replace("[","")
            item = item.replace("]","")
            item = item.replace("'","")
    l1.append(item) 
      

for index, row in anime_data.iterrows():
    item = row['overview']
    if(pd.isnull(row['overview'])):
            item = ""
    else:
       # for item in anime_data['overview']:
            if isinstance(item, (list, tuple)):
                item = ','.join(item)
            else:
                item = item.replace("[","")
                item = item.replace("]","")
    l2.append(item) 

for index, row in anime_data.iterrows():
    item = row['type']
    if(pd.isnull(row['type'])):
        item = np.nan
    else:    
        if "movie" in item:
            item = "movie"
        else:
            item = "tv series"
    l3.append(item)   
    
    
anime_data['genre'] = l1   
anime_data['overview'] = l2
anime_data['type'] = l3

#drop dublicate
#anime_data.drop_duplicates(inplace = True)

### Applying One-Hot and Binary Encoding

In [3]:
from sklearn.preprocessing import LabelBinarizer

anime_data = anime_data.dropna()

#one-hot encoding to transform the genres column to numerical columns
df = anime_data.genre.str.get_dummies(',')

#binary encoding for type column
type_lb = LabelBinarizer()
X = type_lb.fit_transform(anime_data.type.values)


dfOneHot = pd.DataFrame(X, columns = ["movie/TVseries" for i in range(X.shape[1])])
anime_data = pd.concat([anime_data, dfOneHot], axis=1)
anime_data = pd.concat([anime_data, df], axis=1)

anime_data['movie/TVseries'].fillna(0, inplace=True)

#anime_data

### Counting words "overview"

In [4]:
import nltk
#nltk.download('punkt')
#nltk.download('averaged_perceptron_tagger')
def get_words(x):
    bagofwords=[]
    for i in x:
        if i[1]=='NN':
            bagofwords.append(i[0])
        elif i[1]=='NNS':
            bagofwords.append(i[0])
        elif i[1]=='NNP':
            bagofwords.append(i[0])
        elif i[1]=='NNPS':
            bagofwords.append(i[0])
        elif i[1]=='JJ':
            bagofwords.append(i[0])
        elif i[1]=='JJR':
            bagofwords.append(i[0])
        elif i[1]=='JJS':
            bagofwords.append(i[0])
        elif i[1]=='RB':
            bagofwords.append(i[0])
        elif i[1]=='RBR':
            bagofwords.append(i[0])
        elif i[1]=='RBS':
            bagofwords.append(i[0])
    return bagofwords

def clean_words(x):
    b=nltk.pos_tag(nltk.word_tokenize(x))
    result=get_words(b)
    return result

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

summary_doc = anime_data['overview'].fillna("").map(clean_words)
summary_doc =summary_doc.apply(','.join)
 
vectorizer = TfidfVectorizer()
overview_feature = vectorizer.fit_transform(summary_doc).toarray()
#overview_feature = vectorizer.fit_transform(summary_doc)


df = pd.DataFrame(overview_feature, columns = ["word"+ str(int(i)) for i in range(overview_feature.shape[1])])
anime_data = pd.concat([anime_data, df], axis=1)

#drop Null values
#anime_data = anime_data.dropna(inplace=True)

## Feature Extraction

In [6]:
anime_data = anime_data.drop(columns=['Unnamed: 0', 'anime_id', 'name', 'genre', 'overview', 'type'])


### Drop None Values

In [7]:


anime_data = anime_data.dropna()
print(anime_data.shape)


(1930, 7743)


## Modeling

In [8]:
from sklearn import preprocessing

y = anime_data['rating']
X = anime_data.drop(columns=['rating'])

## Selecting Best Features

### 1) Filter Method - Univariate Selection

In [24]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression

selector = SelectKBest(score_func=f_regression, k=1000)
fit = selector.fit(X, y)
# summarize scores
np.set_printoptions(precision=3)
#print(fit.scores_)
features = fit.transform(X)
print(features.shape)
print(X.shape)

(1930, 1000)
(1930, 7742)


## Spliting the Data

In [25]:
from sklearn.model_selection import train_test_split  
from sklearn.preprocessing import StandardScaler 

anime_X_train, anime_X_test, anime_y_train, anime_y_test = train_test_split(features, y, test_size=0.3, random_state=0) 
 
scaler = StandardScaler() 

# Apply transform to both the training set and the test set.
anime_X_train = scaler.fit_transform(anime_X_train)  
anime_X_test = scaler.transform(anime_X_test) 




## Training and Testing

In [26]:
# Create linear regression object
from sklearn import linear_model

regr = linear_model.LinearRegression()

In [27]:
def display_scores(scores):
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Standard deviation:", scores.std())

In [32]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split  

scores = cross_val_score(regr, anime_X_train, anime_y_train, scoring="neg_mean_squared_error", cv=10)

## Evaluation

In [33]:
rmse_scores = np.sqrt(-scores)
display_scores(rmse_scores)

Scores: [8.050e+13 9.332e+13 6.578e+13 9.215e+13 4.538e+13 9.858e+13 1.111e+14
 1.163e+14 8.626e+13 1.154e+14]
Mean: 90474844203271.69
Standard deviation: 21328913742470.11


## Grid Search For Hyper Parameter Selection

In [34]:
from sklearn.model_selection import GridSearchCV

parameters = {
                'fit_intercept':[True,False], 
                'normalize':[True,False], 
                'copy_X':[True, False]
              }

gd_sr = GridSearchCV(estimator=regr,  
                     param_grid=parameters,
                     scoring="neg_mean_squared_error",
                     cv=10)

gd_sr.fit(anime_X_train, anime_y_train)  

best_parameters = gd_sr.best_params_  
print(best_parameters)  



{'copy_X': True, 'fit_intercept': False, 'normalize': True}


In [35]:
best_result = gd_sr.best_score_  
print(-best_result) 

3.633810040538749e+27


## Re-Training

In [36]:
from sklearn import metrics
regr = linear_model.LinearRegression(fit_intercept = False, normalize = True, copy_X = True )

# Train the model using the training sets
regr.fit(anime_X_train, anime_y_train)

# Make predictions using the testing set
anime_y_pred = regr.predict(anime_X_test)

# The mean squared error
print("Mean squared error: %.2f" % np.sqrt(metrics.mean_squared_error(anime_y_test, anime_y_pred)))

Mean squared error: 517195875266821.25
