In [85]:
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MaxAbsScaler,OneHotEncoder,OrdinalEncoder
import lightgbm as lgb
from sklearn.metrics import log_loss
from sklearn.model_selection import cross_val_score
from catboost import CatBoostClassifier

In [16]:
# Load the training and test tables from CSV files under data/.
train = pd.read_csv('data/PS_Train.csv')
test = pd.read_csv('data/PS_Test.csv')

In [17]:
class FeatureSelector(BaseEstimator, TransformerMixin):
    """Pipeline step that keeps only a fixed subset of columns.

    Parameters
    ----------
    feature_names : list
        Column labels to keep, in the order they should appear in the output.
    """

    def __init__(self, feature_names):
        # scikit-learn's get_params()/clone()/repr look up every constructor
        # argument as an attribute of the *same* name, so it must be stored
        # without an underscore prefix.
        self.feature_names = feature_names

    def fit(self, X, y=None):
        """Nothing is learned; returns ``self`` so the step can be chained."""
        return self

    def transform(self, X, y=None):
        """Return ``X`` restricted to ``feature_names``.

        ``X`` must support label-based column selection with a list
        (e.g. a pandas DataFrame).
        """
        return X[self.feature_names]

In [18]:
class CustomColumnTransformer(BaseEstimator, TransformerMixin):
    """One-hot encode a single DataFrame column into 0/1 indicator columns.

    The category vocabulary is learned in ``fit`` (or supplied up front through
    ``categories``), so the transformer does not depend on notebook-level
    variables and emits the same indicator columns for every frame it
    transforms.

    Parameters
    ----------
    column : str
        Label of the column to encode; it is dropped from the output.
    categories : sequence, optional
        Explicit vocabulary to use instead of learning it from the data given
        to ``fit`` (for example the union of the train and test values).
    """

    def __init__(self, column, categories=None):
        self.column = column
        self.categories = categories

    def get_categories(self, data):
        """Return the distinct values of ``data`` as found by ``OneHotEncoder``.

        The result has the layout of the encoder's ``categories_`` attribute:
        a list holding one array per input column.
        """
        encoder = OneHotEncoder()
        encoder.fit(pd.DataFrame(data))
        return encoder.categories_

    def fit(self, X, y=None):
        """Record the sorted, de-duplicated vocabulary in ``categories_``."""
        if self.categories is not None:
            vocabulary = self.categories
        else:
            vocabulary = np.concatenate(self.get_categories(X[self.column]))
        self.categories_ = np.unique(vocabulary).tolist()
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with ``column`` replaced by indicator columns.

        One float column is appended per known category, holding 1.0 where the
        row's value equals that category and 0.0 elsewhere. Values that are not
        in the vocabulary produce an all-zero row. ``X`` itself is not modified.
        """
        from sklearn.utils.validation import check_is_fitted

        check_is_fitted(self, "categories_")
        values = X[self.column]
        # Build every indicator with one vectorised comparison. Plain arrays
        # plus an explicit index keep the rows aligned with X even when X does
        # not carry a default RangeIndex.
        indicators = pd.DataFrame(
            {
                category: values.eq(category).to_numpy(dtype=float)
                for category in self.categories_
            },
            index=X.index,
        )
        return pd.concat([X.drop(columns=[self.column]), indicators], axis=1)

In [79]:
class PriceTransformer(BaseEstimator, TransformerMixin):
    """Make ``Price`` numeric and add the ``RR`` (Rating x Reviews) feature."""

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Nothing is learned; returns ``self`` so the step can be chained."""
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with a float ``Price`` and a new ``RR`` column.

        * ``Price``: the literal ``'Free'`` becomes 0, then the whole column is
          cast to float.
        * ``RR``: element-wise product of ``Rating`` and ``Reviews``.

        Both steps are column-wise operations rather than a Python-level call
        per row.
        """
        X_copy = X.copy()
        price = X_copy['Price']
        # np.where keeps the non-'Free' entries untouched; astype(float) then
        # parses any remaining numeric strings.
        X_copy['Price'] = np.where(price.eq('Free'), 0, price).astype(float)
        X_copy['RR'] = X_copy['Rating'] * X_copy['Reviews']
        return X_copy

In [20]:
# Preview the first rows of the training frame.
train.head()

Unnamed: 0,Offered_By,Category,Rating,Reviews,Size,Price,Content_Rating,Last_Updated_On,Release_Version,OS_Version_Required,Downloads
0,ps_id-24654,Finance,4.18,1481,Varies with device,Free,Everyone,May 05 2020,Varies with device,Varies with device,"100,000+"
1,ps_id-35329,Music And Audio,4.81,302,10M,Free,Everyone,Mar 26 2020,3.9.18,4.1 and up,"5,000+"
2,ps_id-11044,Game Casual,4.27,374,27M,Free,Everyone,May 01 2020,1.10.1,4.1 and up,"10,000+"
3,ps_id-36068,Business,4.03,122058,Varies with device,Free,Teen,May 02 2020,Varies with device,Varies with device,"10,000,000+"
4,ps_id-35831,Medical,4.6,358,Varies with device,297.5742,Everyone,Nov 29 2018,Varies with device,Varies with device,"5,000+"


In [21]:
# Report how many distinct values each of these columns holds in the training
# frame, printed as the shape tuple of its value_counts().
cardinality_columns = [
    'Category',
    'Size',
    'Content_Rating',
    'Release_Version',
    'OS_Version_Required',
    'Downloads',
    'Last_Updated_On',
]
for col in cardinality_columns:
    distinct_shape = train[col].value_counts().shape
    print(f"{col} : {distinct_shape}")

Category : (51,)
Size : (439,)
Content_Rating : (6,)
Release_Version : (4190,)
OS_Version_Required : (27,)
Downloads : (18,)
Last_Updated_On : (1583,)


In [22]:
# Split the training frame into features (every column except 'Downloads')
# and the target, which stays a one-column DataFrame.
is_target = train.columns == 'Downloads'
X_train = train.loc[:, ~is_target]
y_train = train.loc[:, is_target]

In [117]:
# Columns kept by FeatureSelector before encoding.
columns = ['Category','RR','Price','Content_Rating']
preprocessor = ColumnTransformer(
    transformers=[
        # handle_unknown='ignore': a category that appears only in the data
        # being transformed (e.g. the test set) is encoded as all zeros instead
        # of raising, which is what the default handle_unknown='error' does.
        ('cat1', OneHotEncoder(handle_unknown='ignore'), ['Content_Rating','Category']),
    ],
    # Columns not listed above ('RR', 'Price') are passed through unchanged.
    remainder='passthrough',
)

In [118]:
# Feature pipeline: clean Price / add RR -> keep the modelling columns ->
# one-hot encode the categorical ones.
pipeline = make_pipeline(
    PriceTransformer(),
    FeatureSelector(columns),
    preprocessor,
)
# Fit the pipeline on the training features and encode them in one pass.
X_train_enc = pipeline.fit_transform(X_train)

In [98]:
# Dimensions of the encoded training matrix produced by the pipeline.
X_train_enc.shape

(16516, 4)

In [87]:
def measure(model):
    """Fit ``model`` on the encoded training data and print its log-loss.

    The loss is computed on the same rows the model was fitted on, so it is a
    training loss, not an estimate of generalisation error.

    Parameters
    ----------
    model : classifier exposing ``fit`` and ``predict_proba``
        Fitted in place on the notebook-level ``X_train_enc`` / ``y_train``.
    """
    # y_train is a single-column DataFrame; estimators expect a 1-D target and
    # scikit-learn-style wrappers warn (DataConversionWarning) on a column vector.
    target = np.ravel(y_train)
    model.fit(X_train_enc, target)
    print(log_loss(target, model.predict_proba(X_train_enc)))

def cross_val(model):
    """Print the mean 5-fold cross-validated accuracy of ``model`` in percent.

    Parameters
    ----------
    model : scikit-learn compatible classifier
        Evaluated on the notebook-level ``X_train_enc`` / ``y_train``.
    """
    # y_train is a single-column DataFrame; flatten it so each fold is fitted
    # with the 1-D target that estimators expect.
    target = np.ravel(y_train)
    accuracy = cross_val_score(model, X_train_enc, target, cv=5, scoring='accuracy')
    print(accuracy.mean() * 100)

def save(model, path='data/submission.csv'):
    """Write ``model``'s class probabilities for the test set to a CSV file.

    The notebook-level ``test`` frame is run through the already-fitted
    ``pipeline`` and the result of ``model.predict_proba`` is written without
    the row index. Column headers are the positional integers 0..n-1.
    NOTE(review): presumably these follow the order of ``model.classes_`` --
    confirm against the expected submission format.

    Parameters
    ----------
    model : fitted classifier exposing ``predict_proba``
    path : str, optional
        Destination file; defaults to ``'data/submission.csv'``.
    """
    test_enc = pipeline.transform(test)
    probabilities = pd.DataFrame(model.predict_proba(test_enc))
    probabilities.to_csv(path, index=False)

In [86]:
# LightGBM classifier with default hyper-parameters; measure() fits it and
# prints the log-loss on the training data.
gbm = lgb.LGBMClassifier()
measure(gbm)

2.152368468865916


In [119]:
# CatBoost classifier with default hyper-parameters (verbose=False turns off
# its training log output); measure() fits it and prints the training log-loss.
cat = CatBoostClassifier(verbose=False)
measure(cat)

1.0233859703780168


In [111]:
# Write the CatBoost model's test-set class probabilities to the submission CSV.
save(cat)