# Data Cleaning and Machine Learning 

## Reading the New Integrated Sample

In [254]:
import numpy as np
import imdb 
import pandas as pd
anime_data = pd.read_csv("C://Users//Master//new_anime_data.csv")

anime_data['episodes'] = anime_data['episodes'].replace('Unknown', np.nan)
##drop Unknown values
#anime_data = anime_data[~anime_data.episodes.str.contains("Unknown")]


## Data Transformation

### Drop Some Special Characters from "genre" and "overview", and Covert "type" to Movie/Tv series

In [255]:
l1 = []
l2 = []
l3 = []
for item in anime_data['genre']:
    if isinstance(item, (list, tuple)):
        item = ','.join(item)
    else:
        
        item = item.replace(" ","")
        item = item.replace("[","")
        item = item.replace("]","")
        item = item.replace("'","")
    l1.append(item) 

for item in anime_data['overview']:
    if isinstance(item, (list, tuple)):
        item = ','.join(item)
    else:
        item = item.replace("[","")
        item = item.replace("]","")
    l2.append(item) 
   
    
for item in anime_data['type']:
    if "movie" in item:
        item = "movie"
    else:
        item = "tv series"
    l3.append(item)   
    
    
anime_data['genre'] = l1   
anime_data['overview'] = l2
anime_data['type'] = l3

#drop dublicate
anime_data.drop_duplicates(inplace = True)

#fill None with np.nan
anime_data.fillna(value=pd.np.nan, inplace=True)

### Applying One-Hot and Binary Encoding

In [256]:
from sklearn.preprocessing import LabelBinarizer

#one-hot encoding to transform the genres column to numerical columns
df = anime_data.genre.str.get_dummies(',')

#binary encoding for type column
type_lb = LabelBinarizer()
X = type_lb.fit_transform(anime_data.type.values)

dfOneHot = pd.DataFrame(X, columns = ["movie/TVseries" for i in range(X.shape[1])])
anime_data = pd.concat([anime_data, dfOneHot], axis=1)
anime_data = pd.concat([anime_data, df], axis=1)
#anime_data = anime_data.dropna(axis = 'rows')

### Counting words "overview"

In [257]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer() 
overview_feature = vectorizer.fit_transform(anime_data['overview']).toarray()

df = pd.DataFrame(overview_feature, columns = ["word"+ str(int(i)) for i in range(overview_feature.shape[1])])
anime_data = pd.concat([anime_data, df], axis=1)

#drop Null values
#anime_data = anime_data.dropna(axis = 'rows')


## Feature Extraction

In [258]:
anime_data = anime_data.drop(columns=['anime_id', 'name', 'genre', 'overview', 'type'])
anime_data.head(5)

Unnamed: 0.1,Unnamed: 0,episodes,rating,members,movie/TVseries,Action,Adventure,Animation,Biography,Comedy,...,word2526,word2527,word2528,word2529,word2530,word2531,word2532,word2533,word2534,word2535
0,1,64,9.26,793665,1,1,1,1,0,1,...,1,0,0,0,0,0,0,0,0,0
1,2,51,9.25,114262,1,1,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
2,3,24,9.17,673572,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,51,9.16,151266,1,1,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
4,5,10,9.15,93351,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


### Replace Null Values With the Most Frequent Value

In [259]:
from sklearn.impute import SimpleImputer

#anime_data.fillna(value=pd.np.nan, inplace=True)
imp = SimpleImputer(strategy="most_frequent")
feature_vect = imp.fit_transform(anime_data)

df = pd.DataFrame(feature_vect, columns = ["feature"+ str(int(i)) for i in range(feature_vect.shape[1])])
anime_data = df

### Replace Null Values with Mean

In [161]:
from sklearn.impute import SimpleImputer

#anime_data.fillna(value=pd.np.nan, inplace=True)
imp = SimpleImputer(strategy="mean")
feature_vect = imp.fit_transform(anime_data)

df = pd.DataFrame(feature_vect, columns = ["feature"+ str(int(i)) for i in range(feature_vect.shape[1])])
anime_data = df

## Modeling

In [260]:
from sklearn import preprocessing

y = anime_data['feature2']
#lab_enc = preprocessing.LabelEncoder()
training_scores_encoded = y #lab_enc.fit_transform(y)

X = anime_data.drop(columns=['feature2'])

# Split the data into training/testing sets
anime_X_train = X[:-20]
anime_X_test = X[-20:]

# Split the targets into training/testing sets
anime_y_train = training_scores_encoded[:-20]
anime_y_test = training_scores_encoded[-20:]

### Generate Random Values, add it to "members" Attribute

In [264]:
import warnings
warnings.filterwarnings("ignore")

n_samples = anime_X_train.shape[0]
n_features = anime_X_train.shape[1]

rand_rate = 0.50
n_rand_samples = int(np.floor(n_samples * rand_rate))

mu, sigma = anime_X_train['feature3'].mean(), anime_X_train['feature3'].std()

anime_X_train['feature3'].iloc[0:n_rand_samples] = abs(np.random.normal(mu, sigma, size=n_rand_samples))


### Training and Testing

In [262]:
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score

# Create linear regression object
regr = linear_model.LinearRegression()

# Train the model using the training sets
regr.fit(anime_X_train, anime_y_train)

# Make predictions using the testing set
anime_y_pred = regr.predict(anime_X_test)


## Evaluation

In [263]:
# The coefficients
print('Coefficients: \n', regr.coef_)

# The mean squared error
print("Mean squared error: %.2f" % mean_squared_error(anime_y_test, anime_y_pred))

# Explained variance score: 1 is perfect prediction
print('Variance score: %.2f' % r2_score(anime_y_test, anime_y_pred))


Coefficients: 
 [-9.22331394e-03 -5.06933235e-04 -2.71003344e-08 ... -6.37743006e-04
 -2.36665751e-03  8.24986932e-05]
Mean squared error: 0.04
Variance score: -101.90
