## Import Data

In [16]:
import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from error_generator import Random_Active_Domain
from error_generator import List_selected
from error_generator import Read_Write
from error_generator import Error_Generator
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer
from sklearn.linear_model import LassoCV
from yellowbrick.regressor import AlphaSelection
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from yellowbrick.regressor import PredictionError

## Train-test dataset

In [17]:
anime_train = pd.read_csv("data/trainDataset.csv")
anime_test = pd.read_csv("data/testDataset.csv")

In [18]:
anime_train['genre'].fillna('',  inplace=True)
anime_train['overview'].fillna('',  inplace=True)
anime_train['type'].fillna('',  inplace=True)

anime_test['genre'].fillna('',  inplace=True)
anime_test['overview'].fillna('',  inplace=True)
anime_test['type'].fillna('',  inplace=True)

print("Anime train:", anime_train.shape)
print(anime_train.type.value_counts())

print("Anime test:", anime_test.shape)
print(anime_test.type.value_counts())

Anime train: (1600, 8)
movie        800
tv series    793
               7
Name: type, dtype: int64
Anime test: (400, 8)
movie        202
tv series    197
               1
Name: type, dtype: int64


In [19]:
#anime_train['type'].fillna('',  inplace=True)
print("Type",anime_train['type'].isnull().any())

Type False


In [6]:
for feature in list(anime_train):
    print(feature+":", anime_train[feature].isna().sum())

anime_id: 0
name: 0
genre: 0
type: 0
episodes: 50
rating: 36
members: 0
overview: 0


## Generating Errors

In [7]:
def error_generator(dataset, method, n, ignored_columns):

    myselector=List_selected()

    mygen=Error_Generator()

    new_dataset=mygen.error_generator(method_gen=method,selector=myselector,percentage=n,dataset=dataset,mute_column = ignored_columns)
    
    return new_dataset

In [8]:
# generate error in the training set
new_dataset = error_generator(anime_train.values.tolist(), Random_Active_Domain(),75, ignored_columns = [0,1,2,4,5,6,7])
Read_Write.write_csv_dataset("data/{}.csv".format(Random_Active_Domain().name), new_dataset)
anime_train = pd.read_csv("data/{}.csv".format(Random_Active_Domain().name), 
                            names = ['anime_id', 'name','genre','type', 'episodes','rating', 'members', 'overview'])
anime_train.to_csv("data/dirtyTrainDataset.csv", index = False,
                  columns = ['anime_id', 'name','genre','type', 'episodes','rating', 'members', 'overview'])

---------Change according to random_active_domain method ---------------

row: 913 col: 3 : 'movie' changed to 'tv series'  
row: 1274 col: 3 : 'movie' changed to 'tv series'  
row: 1346 col: 3 : 'movie' changed to 'movie'  
row: 917 col: 3 : 'tv series' changed to 'tv series'  
row: 1532 col: 3 : 'movie' changed to 'tv series'  
row: 1472 col: 3 : 'movie' changed to 'movie'  
row: 1290 col: 3 : 'tv series' changed to 'movie'  
row: 459 col: 3 : 'tv series' changed to 'tv series'  
row: 254 col: 3 : 'tv series' changed to 'tv series'  
row: 237 col: 3 : 'movie' changed to 'tv series'  
row: 1436 col: 3 : 'tv series' changed to 'movie'  
row: 1171 col: 3 : 'tv series' changed to 'tv series'  
row: 1199 col: 3 : 'tv series' changed to 'tv series'  
row: 270 col: 3 : 'tv series' changed to 'movie'  
row: 1328 col: 3 : 'movie' changed to 'movie'  
row: 1032 col: 3 : 'movie' changed to 'tv series'  
row: 614 col: 3 : 'tv series' changed to 'movie'  
row: 1568 col: 3 : 'movie' changed to 'tv

## Transformation

In [9]:
import nltk
def get_words(x):
    bagofwords=[]
    for i in x:
        if i[1]=='NN':
            bagofwords.append(i[0])
        elif i[1]=='NNS':
            bagofwords.append(i[0])
        elif i[1]=='NNP':
            bagofwords.append(i[0])
        elif i[1]=='NNPS':
            bagofwords.append(i[0])
        elif i[1]=='JJ':
            bagofwords.append(i[0])
        elif i[1]=='JJR':
            bagofwords.append(i[0])
        elif i[1]=='JJS':
            bagofwords.append(i[0])
        elif i[1]=='RB':
            bagofwords.append(i[0])
        elif i[1]=='RBR':
            bagofwords.append(i[0])
        elif i[1]=='RBS':
            bagofwords.append(i[0])
    return bagofwords

def clean_words(x):
    b=nltk.pos_tag(nltk.word_tokenize(x))
    result=get_words(b)
    return result

In [14]:
def get_dummies(train_data_set, test_data_set):
    train_dummies = train_data_set.genre.str.get_dummies(',')
    test_dummies = test_data_set.genre.str.get_dummies(',')
    
    print("Train Dummies",train_dummies.shape)    
    print("Test Dummies",test_dummies.shape)
    
    #### ALİGN
    train_dummies, test_dummies = train_dummies.align(test_dummies, axis=1, join='left')
    
    test_dummies.fillna(0, inplace=True)
    
    type_lb = LabelBinarizer()
    fitted_type_lb = type_lb.fit(train_data_set.type.values)
    X_train = type_lb.transform(train_data_set.type.values)
    X_test  = type_lb.transform(test_data_set.type.values)
    
    dfOneHot_train = pd.DataFrame(X_train, columns = ["movie/TVseries" for i in range(X_train.shape[1])])
    dfOneHot_test  = pd.DataFrame(X_test,  columns = ["movie/TVseries" for i in range(X_test.shape[1])])
    
    
    train_data_set = pd.concat([train_data_set, dfOneHot_train], axis=1, join="inner")
    train_data_set = pd.concat([train_data_set, train_dummies ], axis=1, join="inner")

    test_data_set = pd.concat([test_data_set, dfOneHot_test], axis=1)
    test_data_set = pd.concat([test_data_set, test_dummies],  axis=1)
    
    test_data_set['movie/TVseries'].fillna(0, inplace=True)
    train_data_set['movie/TVseries'].fillna(0, inplace=True)

    return ([train_data_set, test_data_set])

def feature_transformation(train_data_set, test_data_set):
    
    dummieset = get_dummies(train_data_set, test_data_set)
    train_data_set = dummieset[0]
    test_data_set = dummieset[1]
        
    #Bag of Words
    summary_doc_train = train_data_set['overview'].fillna("").map(clean_words)
    summary_doc_train =summary_doc_train.apply(','.join)
    
    summary_doc_test = test_data_set['overview'].fillna("").map(clean_words)
    summary_doc_test =summary_doc_test.apply(','.join)
 
    vectorizer = TfidfVectorizer()
    fitted_vectorizer = vectorizer.fit(summary_doc_train)
    overview_feature_train = fitted_vectorizer.transform(summary_doc_train).toarray()
    overview_feature_test = fitted_vectorizer.transform(summary_doc_test).toarray()

    df_train = pd.DataFrame(overview_feature_train, columns = ["word"+ str(int(i)) for i in range(overview_feature_train.shape[1])])
    train_data_set = pd.concat([train_data_set, df_train], axis=1)
    
    df_test = pd.DataFrame(overview_feature_test, columns = ["word"+ str(int(i)) for i in range(overview_feature_test.shape[1])])
    test_data_set = pd.concat([test_data_set, df_test], axis=1)
    
    train_data_set = train_data_set.drop(columns=['anime_id', 'name', 'genre', 'overview', 'type'])
    test_data_set = test_data_set.drop(columns=['anime_id', 'name', 'genre', 'overview', 'type'])
    
    #drop NaN values
    train_data_set.dropna(inplace=True)
    test_data_set.dropna(inplace=True)
    
    train_data_set.fillna(0, inplace=True)
    test_data_set.fillna(0, inplace=True)
    
    return ([train_data_set, test_data_set])


In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer

# features vector
transformed_features = feature_transformation(anime_train, anime_test)

anime_train = transformed_features[0]
anime_test = transformed_features[1]

print(anime_train.shape)
print(anime_test.shape)

# split train test
anime_y_train = anime_train['rating']
anime_X_train = anime_train.drop(columns=['rating'])

anime_y_test = anime_test['rating']
anime_X_test = anime_test.drop(columns=['rating'])


Train Dummies (1600, 57)
Test Dummies (400, 49)


TypeError: '<' not supported between instances of 'float' and 'str'

## Feature Selection

In [None]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression
from sklearn.preprocessing import StandardScaler

selector = SelectKBest(score_func=f_regression,k=700)#anime_X_test.shape[1]-1)
features = selector.fit(anime_X_train, anime_y_train)

# summarize scores
np.set_printoptions(precision=3)
#print(fit.scores_)

print(anime_X_train.shape)
print(anime_X_test.shape)
anime_X_train = features.transform(anime_X_train)
anime_X_test = features.transform(anime_X_test)
print(anime_X_train.shape)
print(anime_X_test.shape)

scaler = StandardScaler() 

# Apply transform to both the training set and the test set.
anime_X_train = scaler.fit_transform(anime_X_train)  
anime_X_test = scaler.transform(anime_X_test) 

## Training - Testing - Evaluation

In [None]:
def training(model, dataset, label):
    clf = model
    clf.fit(dataset, label)
    return clf

def testing_evaluation(model, testset):
    # Make predictions using the testing set
    anime_y_pred = model.predict(testset)
    
    # The mean squared error
    print("Mean squared error: %.2f" % np.sqrt(mean_squared_error(anime_y_test, anime_y_pred)))

    
    visualizer = PredictionError(model)
    visualizer.fit(anime_X_train, anime_y_train)  # Fit the training data to the visualizer
    visualizer.score(anime_X_test, anime_y_test)  # Evaluate the model on the test data
    g = visualizer.poof() 
    
    
    plt.plot(anime_y_test, '--g', anime_y_pred, '--b')

### Training

In [None]:
clf = training(model = linear_model.Lasso(), dataset = anime_X_train, label= anime_y_train)
testing_evaluation(clf, anime_X_test)

### Cross Validation

In [None]:
def display_scores(scores):
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Standard deviation:", scores.std())
    
scores = cross_val_score(clf, anime_X_train, anime_y_train, scoring="neg_mean_squared_error", cv=5) 
rmse_scores = np.sqrt(-scores)
display_scores(rmse_scores)

### Grid Search For Hyper Parameter Selection

In [None]:
def checkHP(model, folds, dataset, label):
    parameters = {
                    "alpha" : [1,0.1,0.01,0.001,0.0001,0]
                  }

    gd_sr = GridSearchCV(estimator=model,  
                         param_grid=parameters,
                         scoring="neg_mean_squared_error",
                         cv=folds)

    gd_sr.fit(dataset, label)  
    
    best_parameters = gd_sr.best_params_  
    print("best parameters are: ", best_parameters)

    best_result = gd_sr.best_score_  
    print("The mean squared Error is: %.2f" % -best_result) 
    
checkHP(clf, 10, anime_X_train, anime_y_train) 

In [None]:
clf = training(model = linear_model.Lasso(alpha = 0.01), dataset = anime_X_train, label= anime_y_train)
testing_evaluation(clf, anime_X_test)