# Data Cleaning and Machine Learning 

## Reading the New Integrated Sample

In [1]:
import numpy as np
import imdb 
import pandas as pd
anime_data = pd.read_csv("new_anime_data.csv")

anime_data['episodes'] = anime_data['episodes'].replace('Unknown', np.nan)


## Data Transformation

### Drop Some Special Characters from "genre" and "overview", and Covert "type" to Movie/Tv series

In [2]:
l1 = []
l2 = []
l3 = []


anime_data['genre'].fillna('',  inplace=True)
anime_data['overview'].fillna('',  inplace=True)
anime_data['type'].fillna('',  inplace=True)
for index, row in anime_data.iterrows():
    item = row['genre']
    if(pd.isnull(item)):
            item =""
    else:
        if isinstance(item, (list, tuple)):
            item = ','.join(item)
        else:
            item = item.replace(" ","")
            item = item.replace("[","")
            item = item.replace("]","")
            item = item.replace("'","")
    l1.append(item) 
      

for index, row in anime_data.iterrows():
    item = row['overview']
    if(pd.isnull(row['overview'])):
            item = ""
    else:
       # for item in anime_data['overview']:
            if isinstance(item, (list, tuple)):
                item = ','.join(item)
            else:
                item = item.replace("[","")
                item = item.replace("]","")
    l2.append(item) 

for index, row in anime_data.iterrows():
    item = row['type']
    if(pd.isnull(row['type'])):
        item = np.nan
    else:    
        if "movie" in item:
            item = "movie"
        else:
            item = "tv series"
    l3.append(item)   
    
    
anime_data['genre'] = l1   
anime_data['overview'] = l2
anime_data['type'] = l3

#drop dublicate
#anime_data.drop_duplicates(inplace = True)

### Applying One-Hot and Binary Encoding

In [3]:
from sklearn.preprocessing import LabelBinarizer

anime_data = anime_data.dropna()

#one-hot encoding to transform the genres column to numerical columns
df = anime_data.genre.str.get_dummies(',')

#binary encoding for type column
type_lb = LabelBinarizer()
X = type_lb.fit_transform(anime_data.type.values)


dfOneHot = pd.DataFrame(X, columns = ["movie/TVseries" for i in range(X.shape[1])])
anime_data = pd.concat([anime_data, dfOneHot], axis=1)
anime_data = pd.concat([anime_data, df], axis=1)

anime_data['movie/TVseries'].fillna(0, inplace=True)

#anime_data.shape

### Counting words "overview"

In [4]:
import nltk
#nltk.download('punkt')
#nltk.download('averaged_perceptron_tagger')
def get_words(x):
    bagofwords=[]
    for i in x:
        if i[1]=='NN':
            bagofwords.append(i[0])
        elif i[1]=='NNS':
            bagofwords.append(i[0])
        elif i[1]=='NNP':
            bagofwords.append(i[0])
        elif i[1]=='NNPS':
            bagofwords.append(i[0])
        elif i[1]=='JJ':
            bagofwords.append(i[0])
        elif i[1]=='JJR':
            bagofwords.append(i[0])
        elif i[1]=='JJS':
            bagofwords.append(i[0])
        elif i[1]=='RB':
            bagofwords.append(i[0])
        elif i[1]=='RBR':
            bagofwords.append(i[0])
        elif i[1]=='RBS':
            bagofwords.append(i[0])
    return bagofwords

def clean_words(x):
    b=nltk.pos_tag(nltk.word_tokenize(x))
    result=get_words(b)
    return result

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

summary_doc = anime_data['overview'].fillna("").map(clean_words)
summary_doc =summary_doc.apply(','.join)
 
vectorizer = TfidfVectorizer()
overview_feature = vectorizer.fit_transform(summary_doc).toarray()
#overview_feature = vectorizer.fit_transform(summary_doc)


df = pd.DataFrame(overview_feature, columns = ["word"+ str(int(i)) for i in range(overview_feature.shape[1])])
anime_data = pd.concat([anime_data, df], axis=1)

#drop Null values
#anime_data = anime_data.dropna(inplace=True)

In [6]:
'''
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()
overview_feature = vectorizer.fit_transform(anime_data['overview']).toarray()

df = pd.DataFrame(overview_feature, columns = ["word"+ str(int(i)) for i in range(overview_feature.shape[1])])
anime_data = pd.concat([anime_data, df], axis=1)
'''
anime_data.shape

(2000, 7810)

## Feature Extraction

In [7]:
anime_data = anime_data.drop(columns=['Unnamed: 0', 'anime_id', 'name', 'genre', 'overview', 'type'])


In [8]:
anime_data.shape

(2000, 7804)

### Drop None Values

In [9]:
anime_data = anime_data.dropna()
print(anime_data.shape)

(1937, 7804)


In [10]:
anime_data.head()

Unnamed: 0,episodes,rating,members,movie/TVseries,Action,Adventure,Animation,Biography,Cars,Comedy,...,word7733,word7734,word7735,word7736,word7737,word7738,word7739,word7740,word7741,word7742
0,13,6.83,429.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,5.83,2899.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,6.0,161.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1,5.41,713.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1,8.13,20629.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Generate Errors

In [17]:
from error_generator import Explicit_Missing_Value
from error_generator import Implicit_Missing_Value
from error_generator import White_Noise
from error_generator import Gaussian_Noise
from error_generator import Random_Active_Domain
from error_generator import Similar_Based_Active_Domain
from error_generator import Typo_Keyboard
from error_generator import Typo_Butterfingers
from error_generator import Word2vec_Nearest_Neighbor
from error_generator import Value_Selector
from error_generator import List_selected
from error_generator import Read_Write
from error_generator import Error_Generator


In [18]:
#error_method=Typo_Keyboard()
# mymethod=Typo_Butterfingers()


# mymethod=Similar_Based_Active_Domain()
# mymethod=Random_Active_Domain()


# mymethod=White_Noise()
# mymethod=Gaussian_Noise()

mymethod=Implicit_Missing_Value()
# mymethod=Explicit_Missing_Value()

# mymethod=Word2vec_Nearest_Neighbor()

In [19]:
myselector=List_selected()
mygen=Error_Generator()

In [20]:
new_dataset=mygen.error_generator(method_gen=mymethod,selector=myselector,percentage=10,dataset=anime_data,mute_column=[1,2])

KeyError: 0

## Modeling

In [121]:
from sklearn import preprocessing

y = anime_data['rating']
X = anime_data.drop(columns=['rating'])

## Selecting Best Features

### 1) Filter Method - Univariate Selection

In [122]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression

selector = SelectKBest(score_func=f_regression, k=778)
fit = selector.fit(X, y)
# summarize scores
np.set_printoptions(precision=3)
#print(fit.scores_)
features = fit.transform(X)
print(features.shape)
print(X.shape)

(1930, 778)
(1930, 7742)


## Spliting the Data

In [123]:
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score

# Split the data into training/testing sets
anime_X_train = features[:-500]
anime_X_test = features[-500:]

# Split the targets into training/testing sets
anime_y_train = y[:-500]
anime_y_test = y[-500:]


## Training and Testing

In [124]:
# Create linear regression object
regr = linear_model.LinearRegression()

# Train the model using the training sets
regr.fit(anime_X_train, anime_y_train)

# Make predictions using the testing set
anime_y_pred = regr.predict(anime_X_test)

## Evaluation

In [125]:
#accurcy
print('Accurcy: \n', regr.intercept_)

# The coefficients
print('Coefficients: \n', regr.coef_)

# The mean squared error
print("Mean squared error: %.2f" % np.sqrt(mean_squared_error(anime_y_test, anime_y_pred)))

# Explained variance score: 1 is perfect prediction
print('Variance score: %.2f' % r2_score(anime_y_test, anime_y_pred))



Accurcy: 
 6.4572730304813275
Coefficients: 
 [ 3.803e-04  7.100e-06 -1.516e-01  1.581e-02 -2.559e-02 -1.224e+00
 -1.509e+00  8.172e-03 -4.687e-01 -3.087e+00  1.644e+01 -6.510e-01
 -1.070e+01 -9.408e-11 -1.027e-10  3.600e-01  2.350e-01 -1.452e-01
  1.709e+01 -1.016e-10 -3.600e-11 -4.687e-01 -1.452e-01  1.922e-11
 -1.452e-01 -4.471e+00 -1.271e+00 -1.416e-01 -7.231e-11 -7.544e-01
 -3.921e-01  7.453e-01 -1.815e-11 -1.456e-11  1.669e+00  4.713e+00
 -8.768e-01 -9.373e-01  8.064e-01  1.619e+00 -3.354e-11 -3.331e-01
 -7.965e-12 -3.921e-01 -4.687e-01  9.252e-01  1.488e+00 -3.399e-12
  6.819e-01  4.982e-12 -1.093e-11  1.488e+00  1.488e+00  1.941e-11
 -7.246e+00 -4.693e-01  3.600e-01 -1.526e+00  2.334e+00 -3.623e+00
  2.933e+00 -3.648e+00  1.526e-12 -1.455e-01 -1.575e+00  6.969e-01
  1.800e-12 -1.452e-01  1.454e+01  4.141e+01 -4.500e-01 -2.832e-01
 -1.452e-01 -2.832e-01  1.046e+00  4.086e-02 -2.256e+01  2.482e-12
  3.211e+00 -4.687e-01  1.003e+00  3.033e+00  9.610e-13  8.172e-03
 -4.687e-01  6.8