In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.model_selection import train_test_split
from scipy.stats import pearsonr
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
import time
from sklearn.cluster import MeanShift, KMeans
import matplotlib.pyplot as plt

# 1. Prepare Data

## 1.1 Data load and memory reduction

In [None]:
%%time
train = pd.read_parquet('../input/ubiquant-parquet/train_low_mem.parquet')[:40000]
target=train['target']
features = [f'f_{i}' for i in range(300)]
train=train[features]

#display(train.info(), train.head())

In [None]:
# Display the feature-only training frame produced by the load cell.
train


In [None]:
def reduce_mem_usage(df):
    """Downcast the numeric columns of ``df`` in place to narrower dtypes.

    For every signed-integer, unsigned-integer and floating-point column the
    narrowest dtype of the same family whose range contains the column's
    minimum and maximum is selected (bounds are inclusive). Columns of any
    other dtype (bool, object, datetime, pandas extension dtypes, ...) are left
    untouched, and a column is never widened.

    Note: floats are chosen by value *range* only, so a downcast to float16 or
    float32 can lose precision.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after downcasting.
    """
    start_mem = df.memory_usage().sum() / 1024**2

    # Candidate dtypes per numpy "kind" code, ordered from narrowest to widest.
    candidates_by_kind = {
        'i': (np.int8, np.int16, np.int32, np.int64),
        'u': (np.uint8, np.uint16, np.uint32, np.uint64),
        'f': (np.float16, np.float32, np.float64),
    }

    for col in df.columns:
        col_type = df[col].dtype

        # Only plain numpy integer/float columns are candidates for downcasting.
        if not isinstance(col_type, np.dtype) or col_type.kind not in candidates_by_kind:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        limits = np.finfo if col_type.kind == 'f' else np.iinfo

        for candidate in candidates_by_kind[col_type.kind]:
            info = limits(candidate)
            # NaN or infinite bounds (empty, all-NaN or non-finite columns)
            # fail every comparison, so such columns keep their dtype.
            if c_min >= info.min and c_max <= info.max:
                if np.dtype(candidate).itemsize < col_type.itemsize:
                    df[col] = df[col].astype(candidate)
                break

    end_mem = df.memory_usage().sum() / 1024**2
    # Greek message, roughly "from <start> it went to <end>" (sizes in MiB).
    print('απο ',start_mem,'πήγε ',end_mem)
    return df

In [None]:
# Downcast the feature columns; reduce_mem_usage modifies the frame in place
# and returns the same object, so this rebinding keeps the name `train`.
train=reduce_mem_usage(train)

# 2. Clustering

In [None]:
# Standardise the features, then project them with PCA. PCA(.98) is given a
# float n_components; the fitted `scaler` and `pca` objects are reused later by
# prepare_data(), so they stay at module level.
scaler = StandardScaler()
pca = PCA(.98)
train = pd.DataFrame(pca.fit_transform(scaler.fit_transform(train)))

In [None]:
# Display the training frame after scaling and PCA projection.
train

In [None]:
# Elbow method: fit KMeans for k = 1..6 and plot the within-cluster sum of
# squares (inertia) against k.
inertia = []
K = range(1, 7)
for k in K:
    # Fixed seed so the curve is reproducible across kernel restarts.
    km = KMeans(n_clusters=k, random_state=42)
    km = km.fit(train)
    inertia.append(km.inertia_)
plt.plot(K, inertia, marker="x")
plt.xlabel('k')
plt.xticks(list(K))  # tick only the k values that were actually evaluated
plt.ylabel('Inertia')
plt.title('Elbow Method')
plt.show()

In [None]:
# Number of clusters used for the per-cluster models.
N_CLUSTERS = 3

# Cluster on the PCA components only. Excluding the bookkeeping columns makes
# the cell safe to re-run after 'cluster' / 'target' have been added to `train`.
pca_columns = [c for c in train.columns if c not in ('cluster', 'target')]

# Fixed seed so cluster labels (and therefore the per-cluster models) are
# reproducible across kernel restarts.
km = KMeans(n_clusters=N_CLUSTERS, random_state=42)
train['cluster'] = km.fit_predict(train[pca_columns].astype('double'))

In [None]:
# Number of rows assigned to each cluster (one value per cluster instead of
# the same count repeated for every column).
train.groupby('cluster').size()

In [None]:
# Print each distinct cluster label, in order of first appearance.
for i in train['cluster'].unique():
    print(i)

# 3. Train Models

In [None]:
def rmse(predict, actual):
    """Root-mean-square error between two equally shaped sequences.

    Both arguments are converted with ``np.array``; the result is the square
    root of the mean of the element-wise squared differences.
    """
    residuals = np.array(predict) - np.array(actual)
    return np.sqrt((residuals ** 2).mean())

# Wrap rmse as a scikit-learn scorer. greater_is_better=False marks it as a
# loss, so the scorer negates the value and lower RMSE ranks higher.
# In the visible notebook it is only referenced by a commented-out grid search.
rmse_score = make_scorer(rmse, greater_is_better = False)

In [None]:
def model_training(train_data, random_state=42):
    """Fit an XGBoost regressor on one cluster's rows and return a hold-out split.

    Parameters
    ----------
    train_data : pandas.DataFrame
        Feature columns plus a ``'target'`` column. The frame is not modified.
    random_state : int or None, default 42
        Seed for both the train/hold-out split and the regressor, so repeated
        runs give the same model. Pass ``None`` for an unseeded run.

    Returns
    -------
    tuple
        ``(model, X_test, y_test)``: the fitted regressor and the 5% hold-out
        features and targets that were not used for fitting.
    """
    # Build the feature matrix as a new frame instead of deleting the column
    # from the caller's data.
    X = train_data.drop(columns='target')
    y = train_data['target']
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=0.95, test_size=0.05, shuffle=True, random_state=random_state
    )

    pp={'n_estimators':1000, 'max_depth':7, 'eta':0.1, 'subsample':0.7, 'colsample_bytree':0.8}
    model = XGBRegressor(random_state=random_state)
    model.set_params(**pp)

    model = model.fit(X_train, y_train)

    return (model, X_test, y_test)

In [None]:
def get_prediction(data,model):
    """Return ``model.predict(data)`` unchanged."""
    return model.predict(data)
    

In [None]:
# Maps a cluster label to the model fitted on that cluster's rows; it is
# filled by the per-cluster training loop.
model_dict={}

In [None]:
%%time
train['target']=target
for i in range(len(train['cluster'].unique())):
    print(i)
    train_data=train[train['cluster']==i]
    del train_data['cluster']
    print(train_data.shape)
    model,X_test,y_test=model_training(train_data)
    model_dict[i]=model
    

In [None]:
%%time
data_cluster=km.predict(X_test.astype('double'))
model=model_dict[data_cluster[0]]
pred = get_prediction(X_test,model)

pred

In [None]:
# Evaluate: Pearson correlation coefficient between the hold-out targets and
# the predictions ([0] selects the coefficient from pearsonr's result).
pearsonr(y_test, pred)[0]

# 4. Submission

In [None]:
def prepare_data(data):
    """Apply the fitted scaling and PCA projection to a raw frame.

    Selects columns ``f_0`` .. ``f_299`` from ``data``, passes them through the
    module-level ``scaler`` and ``pca`` (both must already be fitted) and
    returns the projected values as a new DataFrame with a default index.
    """
    feature_columns = ['f_' + str(idx) for idx in range(300)]
    scaled = scaler.transform(data[feature_columns])
    projected = pca.transform(scaled)
    return pd.DataFrame(projected)
    

In [None]:
import ubiquant
env = ubiquant.make_env()   # initialize the environment
iter_test = env.iter_test()    # an iterator which loops over the test set and sample submission
for (test_df, sample_prediction_df) in iter_test:
    # Scale and PCA-project the batch the same way as the training data.
    test_df_for_prediction = prepare_data(test_df)

    # Assign every row to a cluster in a single call; KMeans expects a 2-D
    # (n_rows, n_components) input, which the whole frame already is.
    batch_clusters = km.predict(test_df_for_prediction.astype('double'))

    # Predict each cluster's rows with that cluster's model and write the
    # results back by position.
    batch_pred = np.zeros(len(test_df_for_prediction), dtype='float64')
    for cluster_id in np.unique(batch_clusters):
        in_cluster = batch_clusters == cluster_id
        batch_pred[in_cluster] = get_prediction(
            test_df_for_prediction[in_cluster], model_dict[int(cluster_id)]
        )

    # Assign the whole column at once (no chained .iloc assignment); rows of
    # sample_prediction_df are matched to test_df rows by position, as before.
    sample_prediction_df['target'] = batch_pred
    env.predict(sample_prediction_df.astype({'row_id': 'str','target':'float64'}))   # register your predictions
