### Import Libraries and datasets

In [1]:
# import libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

In [2]:
# Load the training set, the test set and the sample submission sheet
Train, Test = (pd.read_csv(path) for path in ('Data_Train.csv', 'Data_Test.csv'))
sample = pd.read_excel('Sample_Submission.xlsx')

### Explore Data

In [3]:
# (rows, columns) of the training frame, followed by its first five records
print(Train.shape)
Train.head(n=5)

(78458, 11)


Unnamed: 0,Unique_ID,Name,Genre,Country,Song_Name,Timestamp,Views,Comments,Likes,Popularity,Followers
0,413890,Hardstyle,danceedm,AU,N-Vitral presents BOMBSQUAD - Poison Spitter (...,2018-03-30 15:24:45.000000,14017,4,499,97,119563
1,249453,Dj Aladdin,danceedm,AU,Dj Aladdin - Old School Hip Hop Quick Mix,2016-06-20 05:58:52.000000,1918,17,49,17,2141
2,681116,Maxximize On Air,danceedm,AU,Maxximize On Air - Mixed by Blasterjaxx - Epis...,2015-05-08 17:45:59.000000,9668,11,312,91,22248
3,387253,GR6 EXPLODE,rbsoul,AU,MC Yago - Tenho Compromisso (DJ R7),2017-06-08 23:50:03.000000,113036,2,2400,76,393655
4,1428029,Tritonal,danceedm,AU,Escape (feat. Steph Jones),2016-09-17 20:50:19.000000,110024,81,3031,699,201030


In [4]:
# (rows, columns) of the test frame, followed by its first five records
print(Test.shape)
Test.head(n=5)

(19615, 10)


Unnamed: 0,Unique_ID,Name,Genre,Country,Song_Name,Timestamp,Comments,Likes,Popularity,Followers
0,562546,L2Share♫79,all-music,AU,엔플라잉 N.Flying - Rooftop (옥탑방),2019-01-02 09:49:19.000000,26,5428,118,2568
1,907584,Morgan Page,danceedm,AU,Morgan Page - In The Air - Episode 246,2015-03-09 17:30:10.000000,18,738,195,155645
2,213013,Dirty Workz,danceedm,AU,Outlander - Devotion (Official HQ Preview),2013-09-18 16:09:03.000000,18,386,96,29446
3,340312,Gucci Mane,all-music,AU,Freaky Gurl,2007-08-14 00:00:00.000000,1,388,19,799410
4,41854,Angga Kecap,rock,AU,Radical Nusantara - Teman,2018-08-10 16:28:54.000000,0,79,1,2


In [5]:
# Column dtypes and non-null counts for the training frame. In the recorded output
# Likes and Popularity are reported as object rather than int64, and Song_Name has
# one missing value.
Train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 78458 entries, 0 to 78457
Data columns (total 11 columns):
Unique_ID     78458 non-null int64
Name          78458 non-null object
Genre         78458 non-null object
Country       78458 non-null object
Song_Name     78457 non-null object
Timestamp     78458 non-null object
Views         78458 non-null int64
Comments      78458 non-null int64
Likes         78458 non-null object
Popularity    78458 non-null object
Followers     78458 non-null int64
dtypes: int64(4), object(7)
memory usage: 6.6+ MB


In [6]:
# Function to find non numerals in the series
def non_numerals(series):
    """Return the distinct values of ``series`` that ``float()`` cannot convert.

    Parameters
    ----------
    series : pandas.Series
        Column to inspect; only its ``unique()`` values are tested.

    Returns
    -------
    list
        The unconvertible values, in the order ``series.unique()`` yields them.
    """
    rejected = []
    for value in series.unique():
        try:
            float(value)
        except (TypeError, ValueError, OverflowError):
            # Only conversion failures mark a value as non-numeric. A bare
            # ``except`` would also swallow KeyboardInterrupt / SystemExit.
            rejected.append(value)
    return rejected

### Data Preprocessing

In [7]:
# Remove the columns that are not used as model features from both frames
for frame in (Train, Test):
    frame.drop(columns=['Country', 'Song_Name', 'Unique_ID'], inplace=True)

In [8]:
# Separate X_train and y_train
# .copy() gives X_train and test their own data: Clean() assigns columns in place,
# which would otherwise raise SettingWithCopyWarning on the X_train slice and
# silently rewrite Test through the `test` alias.
X_train = Train[['Name','Genre','Timestamp','Comments','Likes','Popularity','Followers']].copy()
y_train = Train['Views']
test = Test.copy()

In [9]:
# Function to clean the Likes and Popularity columns
def _parse_count(value):
    """Convert one count such as '1,234', '4.5K' or '2M' (or a plain number) to an int."""
    # str() lets values that are already numeric pass through instead of
    # failing on the missing .replace method.
    text = str(value).replace(',', '').strip()
    for suffix, factor in (('K', 1000), ('M', 1000000)):
        if suffix in text:
            # round() instead of int(): float('1.005') * 1000 evaluates to
            # 1004.9999999999999, which int() would truncate to 1004.
            return round(float(text.replace(suffix, '')) * factor)
    return int(text)


def Clean_numbers(data):
    """Convert a Series of abbreviated counts to integers.

    Thousands separators (',') are removed, a 'K' suffix multiplies by 1,000
    and an 'M' suffix multiplies by 1,000,000 ('K' is checked first).

    Parameters
    ----------
    data : pandas.Series
        Values such as '499', '1,234', '4.5K', '2M', or plain integers.

    Returns
    -------
    pandas.Series
        The same index with each value converted to an integer.

    Raises
    ------
    ValueError
        If a value is not a number once commas and the suffix are removed.
    """
    return data.apply(_parse_count)

In [None]:
# Function to clean a whole training or testing frame
def Clean(data):
    """Return a cleaned copy of ``data`` in which every feature column is numeric.

    * ``Name`` and ``Genre`` are label-encoded to integer codes.
    * ``Timestamp`` is parsed as a datetime and reduced to its year.
    * ``Likes`` and ``Popularity`` are converted with ``Clean_numbers``.

    The encoders are fitted inside this call, so the integer codes are only
    comparable between rows that were cleaned in the same call.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns Name, Genre, Timestamp, Likes and Popularity.

    Returns
    -------
    pandas.DataFrame
        A new frame; the frame passed in is left unchanged.
    """
    # Work on a copy so the caller's frame (possibly a slice or an alias of the
    # raw data) is not modified and re-running the calling cell stays safe.
    data = data.copy()

    for column in ('Name', 'Genre'):
        data[column] = LabelEncoder().fit_transform(data[column])

    data['Timestamp'] = pd.to_datetime(data['Timestamp']).dt.year

    data['Likes'] = Clean_numbers(data['Likes'])
    data['Popularity'] = Clean_numbers(data['Popularity'])

    return data

In [None]:
# Clean train and test in ONE call so the LabelEncoders inside Clean() are fitted on
# the union of both sets. Cleaning them in separate calls fits separate encoders, so
# the same Name/Genre would get a different integer code in X_train and X_test.
combined = pd.concat([X_train, test[X_train.columns]], keys=['train', 'test'])
combined = Clean(combined)
X_train = combined.loc['train']
X_test = combined.loc['test']

In [None]:
# Swap the pandas objects for their underlying NumPy arrays
X_train, X_test, y_train = X_train.values, X_test.values, y_train.values

### Creating Train_test_split for validation

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the training rows for validation; the fixed seed makes the split repeatable
train_x, val_x, train_y, val_y = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=123,
)

In [None]:
# One shape per line: training features, training target, validation features, validation target
print(*(part.shape for part in (train_x, train_y, val_x, val_y)), sep='\n')

### Validation using RMSLE (root mean squared logarithmic error)

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Fit a 300-tree random forest on the training split and predict the hold-out split
regressor = RandomForestRegressor(n_estimators=300, random_state=0).fit(train_x, train_y)

y_true = val_y
y_pred = regressor.predict(val_x)

# Root of the mean squared difference between log10(1 + prediction) and log10(1 + target)
log_gap = np.log10(y_pred + 1) - np.log10(y_true + 1)
error = np.square(log_gap).mean() ** 0.5
score = 1 - error

print("RMLSE Score = ", score)

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Single decision tree (only the seed is set), scored on the same hold-out split
regressor = DecisionTreeRegressor(random_state=0)
regressor.fit(train_x, train_y)

y_true, y_pred = val_y, regressor.predict(val_x)

# Same metric: root mean squared difference of the log10(1 + value) transforms
log_pred = np.log10(y_pred + 1)
log_true = np.log10(y_true + 1)
error = np.square(log_pred - log_true).mean() ** 0.5
score = 1 - error

print("RMLSE Score = ", score)

### Model Building

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Refit the 300-tree forest on all training rows, then predict Views for the test set
model = RandomForestRegressor(n_estimators=300, random_state=0).fit(X_train, y_train)
y_pred = model.predict(X_test)

In [None]:
# Views are whole numbers: round to the nearest integer first, because astype(int)
# on its own truncates toward zero (1234.9 -> 1234). The explicit 64-bit type keeps
# the result independent of the platform's default integer width.
y_pred = np.rint(y_pred).astype(np.int64)

In [None]:
# Unique_ID is dropped from Test in place during preprocessing, so Test['Unique_ID']
# raises KeyError at this point. Use the column if it is still present; otherwise
# re-read it from the source file. Nothing in this notebook reorders or filters the
# test rows, so the IDs line up with y_pred row for row.
if 'Unique_ID' in Test.columns:
    unique_ids = Test['Unique_ID']
else:
    unique_ids = pd.read_csv('Data_Test.csv', usecols=['Unique_ID'])['Unique_ID']

info = {'Unique_ID': unique_ids, 'Views': y_pred}
data_frame = pd.DataFrame(info)
data_frame.to_excel('Sol1.xlsx', index=False)

In [None]:
# Last five rows of the sample submission, to compare its layout with ours
sample.tail(n=5)

In [None]:
# First five rows of the submission frame that was just written
data_frame.head(n=5)

In [None]:
# First five rows of the Test frame in its current state
Test.head(n=5)