In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install tensorflow==2.12.0 deepctr optuna



In [3]:
!pip install deepctr



In [4]:
!pip install eda-report



In [5]:
!pip install pandas-profiling[notebook]



In [6]:
import pandas as pd
import eda_report
import numpy as np
from ydata_profiling import ProfileReport
import psycopg2 as pg
from sklearn.metrics import log_loss, roc_auc_score,mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import KFold
from sklearn.metrics import log_loss, accuracy_score
from deepctr.models import *
from deepctr.feature_column import SparseFeat, DenseFeat, get_feature_names
from bayes_opt import BayesianOptimization

def check_column_types():
    pass


def get_db_info(dbtype):
    pass

def difference_in_minutes(min_date,dt):
      return int((dt - min_date).total_seconds() / 60)

def process_datetime(df):

  datetime_cols = []

  for col in df.columns:
      try:
          df[col] = pd.to_datetime(df[col])
          datetime_cols.append(col)
      except (ValueError, TypeError):
          pass

  min_date = min([df[col].min() for col in datetime_cols])


  for col in datetime_cols:
      df[col] = df[col].apply(lambda x: difference_in_minutes(min_date,x))

  return df


def define_structure(data,structure):
    rating_ind=structure["rating"]
    col_user_ind=structure["user"]
    col_item_ind=structure["item"]
    user_features_ind=structure["user_features"]
    item_features_ind=structure["item_features"]
    print(rating_ind, rating_ind,col_user_ind,col_item_ind,user_features_ind,item_features_ind)
    columns=list(data.columns)
    columns_final=([rating_ind,col_user_ind,col_item_ind]+[columns[i] for i in user_features_ind+item_features_ind])
    for i in columns_final:
      print(i)
    return data[columns_final]



def MVP(data,missing_type):
    pass

def EDA(data):
    profile = ProfileReport(data, title="EDA Report")

    # Display the report in a Jupyter Notebook
    profile.to_notebook_iframe()
    profile.to_file("eda_report.html")


def classify_columns(df):
    num_cols = []
    str_cols = []

    for col in df.columns:
        try:
            df[col] = df[col].apply(pd.to_numeric, errors='raise')
            num_cols.append(col)
        except ValueError:
            str_cols.append(col)

    return num_cols, str_cols

def Train_Test_Split(X,y):
  return train_test_split(X,y,stratify=y)


def prediction(data,f="/content/best_model.h5"):

        dense_features, sparse_features=classify_columns(data[data.columns[1:]])
        target = data.columns[0]


        for feat in sparse_features:
            lbe = LabelEncoder()
            data[feat] = lbe.fit_transform(data[feat])


        mms = MinMaxScaler(feature_range=(0, 1))
        data[dense_features] = mms.fit_transform(data[dense_features])

        fixlen_feature_columns = [SparseFeat(feat, vocabulary_size=data[feat].max() + 1, embedding_dim=4)
                                for i, feat in enumerate(sparse_features)] + [DenseFeat(feat, 1, )
                                                                                for feat in dense_features]

        dnn_feature_columns = fixlen_feature_columns
        linear_feature_columns = fixlen_feature_columns

        feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)
        model = xDeepFM(linear_feature_columns, dnn_feature_columns, task='regression')
        model.compile("adam", "mse",
                        metrics=['mse'])
        model_input = {name:data[name].values for name in feature_names}

        model.load_weights(f)
        print(mean_squared_error(model.predict(model_input,batch_size=256),data[target]))

def modelling(data,trained,model_path="xdeepfm"):

        dense_features, sparse_features=classify_columns(data[data.columns[1:]])
        target = data.columns[0]


        for feat in sparse_features:
            lbe = LabelEncoder()
            data[feat] = lbe.fit_transform(data[feat])


        mms = MinMaxScaler(feature_range=(0, 1))
        data[dense_features] = mms.fit_transform(data[dense_features])

        fixlen_feature_columns = [SparseFeat(feat, vocabulary_size=data[feat].max() + 1, embedding_dim=4)
                                for i, feat in enumerate(sparse_features)] + [DenseFeat(feat, 1, )
                                                                                for feat in dense_features]

        dnn_feature_columns = fixlen_feature_columns
        linear_feature_columns = fixlen_feature_columns

        feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)
        model = xDeepFM(linear_feature_columns, dnn_feature_columns, task='regression')
        model.compile("adam", "mse",
                        metrics=['mse'])
        model_input = {name:data[name].values for name in feature_names}

        if trained:
          model.load_weights(model_path)
        else:
          model.fit(model_input, data[target], batch_size=256, epochs=1, verbose=2)
          model.save_weights("best_model.h5")

def main(col_user,col_item,data=np.nan,items=np.nan,users=np.nan,triplets=np.nan,num_user_features=0,num_item_features=0,only_triplet=True,custom_structure=True,structure={},missing_value=[],trained=None,str_rating=False):
    """

    Args:
        (str)data:(address) full dataset on which model is trained,test and tuned
        col_user:name of user column
        col_item: name of item column
        num_user_features: num of feature column for user
        num_item_features: num of feature column for item
        only_triplet: only user-item-rating triplet pair are utilized in this system
        custom_structure

    structure of data:
        <rating>,<col_user>,<col_item>,<user_feature_col1>,...<user_feature_colm>,<item_feature_col1>,...<item_feature_coln>

    """
    #data 3 df - user,item,rating
    if data!=None:
      filetype=data.split(".")[-1]
      if filetype=="csv":
          data=pd.read_csv(data)
      elif filetype=="parquet":
          data=pd.read_parquet(data)
      elif filetype=="sql":
          info=get_db_info(filetype)
          engine = pg.connect(f"dbname='{info['db']}' user='{info['user']}' host='{info['host']}' port='{info['port']}' password='{info['password']}'")
          df = pd.read_sql(f'select * from {info[""]}', con=engine)
          data=pd.read_sql(data)
      elif filetype=="db" or filetype=="sqlite3":
          info=get_db_info(filetype)
          df = pd.read_sql(f'select * from {info["table_name"]}', con=engine)
          data=pd.read_sql(data)
    elif data==None and users!=None and items!=None and triplets!=None:
      filetype=users.split(".")[-1]
      if filetype=="csv":
          items=pd.read_csv(items)
          users=pd.read_csv(users)
          triplet=pd.read_csv(triplet)
      elif filetype=="parquet":
          items=pd.read_parquet(items)
          users=pd.read_parquet(users)
          triplet=pd.read_parquet(triplet)
      else:
        return False
      merged_df=pd.merged_df = pd.merge(items, triplet, on=col_item)

      data = pd.merge(merged_df, users, on=col_user)

    # print(data.head(10))
    if custom_structure:
        data=define_structure(data,structure)

    # Save the report as an HTML file
    # EDA(data)

    rating_col=data.columns[0]
    col_user=data.columns[1]
    col_item=data.columns[2]
    user_feature_cols=data.columns[3:num_user_features]
    item_feature_cols=data.columns[num_user_features:num_item_features]

    if missing_value!=[]:
        data= MVP(data,missing_value)
    else:
      data.dropna(inplace=True)

    data=process_datetime(data)
    if str_rating:
      lbe = LabelEncoder()
      data[rating_col] = lbe.fit_transform(data[rating_col])

    try:
        data[rating_col]=data[rating_col].astype(int)
        data[col_user]=data[col_user].astype(str)
        data[col_item]=data[col_item].astype(str)
    except Exception as e:
        raise Exception("rating:int \n col_user:str \n col_item:str")
    data_orig=data.copy(deep=True)
    modelling(data,False)
    X_train, X_test, y_train, y_test=Train_Test_Split(data_orig.drop(rating_col, axis=1),data_orig[rating_col])
    print(X_test.dtypes,data_orig.dtypes)
    prediction(data_orig.copy(deep=True),"/content/best_model.h5")
    return data_orig

In [7]:
import string

alphabet_list = list(string.ascii_uppercase)

user_feat = ["E", "G","I", "W", "Y", "Z", "AB", "AH", "AJ", "AK", "AF", "AG", "AL"]
item_feat = ["H", "J", "K", "L", "M", "N", "O", "P", "Q", "R", "T", "U", "AA"]
import string

alphabet = list(string.ascii_uppercase)
alphabet_index = {letter: index + 1 for index, letter in enumerate(alphabet)}

def get_index(word):
    index = 0
    for i, char in enumerate(word):
        index += alphabet_index[char] * (26 ** (len(word) - i - 1))
    return index

user_feat_indices = {feat: get_index(feat) for feat in user_feat}
item_feat_indices = {feat: get_index(feat) for feat in item_feat}


In [8]:
import tensorflow as tf

In [9]:
structure={
    "user":"Client",
    "item":"ChauffeurName",
    "user_features":list(user_feat_indices.values()),
    "item_features":list(item_feat_indices.values()),
    "rating":"ChauffeurPriority"

}
data=main(data="/content/drive/MyDrive/preprocessed.csv",col_user="Client",col_item="ChauffeurName",num_user_features=14,num_item_features=13,custom_structure=True,structure=structure,str_rating=True)

ChauffeurPriority ChauffeurPriority Client ChauffeurName [5, 7, 9, 23, 25, 26, 28, 34, 36, 37, 32, 33, 38] [8, 10, 11, 12, 13, 14, 15, 16, 17, 18, 20, 21, 27]
ChauffeurPriority
Client
ChauffeurName
BookingType
NoOfLegs
VehicleType
DateCompleted
TotalDistance
DurationToStartPoint
OnsiteTime
DropAddress
DropLatitude
ClientPriority
PickpupLatitude
PickupTravelClass
VehiclePriority
Port
VehicleSeatCapacity
IsBus
VehicleColour
VehicleInsuranceExpiryDate
vehicleCurrentLatitude
VehicleCurrentLongitude
VehicleRegoExpiryDate
VehicleRWCExpiryDate
VehicleGPSLocationDateTime
DriverPoliceCheckDate
NoOfBags
BookingRegion


  df[col] = pd.to_datetime(df[col])
  df[col] = pd.to_datetime(df[col])
  df[col] = pd.to_datetime(df[col])
  df[col] = pd.to_datetime(df[col])
  df[col] = pd.to_datetime(df[col])
  df[col] = pd.to_datetime(df[col])
  df[col] = pd.to_datetime(df[col])
  df[col] = pd.to_datetime(df[col])
  df[col] = pd.to_datetime(df[col])
  df[col] = pd.to_datetime(df[col])
  df[col] = pd.to_datetime(df[col])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].apply(pd.to_numeric, errors='raise')


5/5 - 9s - loss: 1.9472 - mse: 1.9472
Client                        object
ChauffeurName                 object
BookingType                   object
NoOfLegs                       int64
VehicleType                   object
DateCompleted                  int64
TotalDistance                  int64
DurationToStartPoint           int64
OnsiteTime                     int64
DropAddress                   object
DropLatitude                   int64
ClientPriority                object
PickpupLatitude                int64
PickupTravelClass             object
VehiclePriority               object
Port                          object
VehicleSeatCapacity            int64
IsBus                          int64
VehicleColour                 object
VehicleInsuranceExpiryDate     int64
vehicleCurrentLatitude         int64
VehicleCurrentLongitude        int64
VehicleRegoExpiryDate          int64
VehicleRWCExpiryDate           int64
VehicleGPSLocationDateTime     int64
DriverPoliceCheckDate          int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].apply(pd.to_numeric, errors='raise')


1.2653156094076141


In [10]:
import string

alphabet_list = list(string.ascii_uppercase)

user_feat = ["E", "G", "H", "I", "W", "Y", "Z", "AB", "AH", "AJ", "AK", "AF", "AG", "AL"]
item_feat = ["H", "J", "K", "L", "M", "N", "O", "P", "Q", "R", "T", "U", "AA"]
import string

alphabet = list(string.ascii_uppercase)
alphabet_index = {letter: index + 1 for index, letter in enumerate(alphabet)}

def get_index(word):
    index = 0
    for i, char in enumerate(word):
        index += alphabet_index[char] * (26 ** (len(word) - i - 1))
    return index

user_feat_indices = {feat: get_index(feat) for feat in user_feat}
item_feat_indices = {feat: get_index(feat) for feat in item_feat}


In [11]:
list(user_feat_indices.values()),(list(item_feat_indices.values()))

([5, 7, 8, 9, 23, 25, 26, 28, 34, 36, 37, 32, 33, 38],
 [8, 10, 11, 12, 13, 14, 15, 16, 17, 18, 20, 21, 27])

In [12]:
data

Unnamed: 0,ChauffeurPriority,Client,ChauffeurName,BookingType,NoOfLegs,VehicleType,DateCompleted,TotalDistance,DurationToStartPoint,OnsiteTime,...,VehicleColour,VehicleInsuranceExpiryDate,vehicleCurrentLatitude,VehicleCurrentLongitude,VehicleRegoExpiryDate,VehicleRWCExpiryDate,VehicleGPSLocationDateTime,DriverPoliceCheckDate,NoOfBags,BookingRegion
0,0,JETCONNECT LTD,BIBHUSAN JOSHI,Booking,0,PEOPLE MOVER (6-7 SEATS),28138645,0,0,28138564,...,Black,28375200,0,0,28372320,28372320,28184230,31645440,0,Melbourne
9,2,EMIRATES AIRWAYS - PERTH,SAM SAAD,Booking,0,PEOPLE MOVER (6-7 SEATS),28138696,0,0,28138651,...,Black,28618560,0,0,28327680,28224000,28184371,28385280,0,PERTH
26,2,EMIRATES AIRWAYS - PERTH,ROSHAN BURAH,Booking,0,SPORTS UTILITY VEHICLE,28138694,0,0,28138621,...,Black,28310400,0,0,28249920,28654560,28183700,28702080,0,PERTH
29,2,EMIRATES AIRWAYS - PERTH,AMANDA FAUNTLEROY,Booking,0,STANDARD VEHICLE,28138719,0,0,28138641,...,White,28310400,0,0,28427040,28337760,28184371,28226880,0,PERTH
30,0,EMIRATES AIRWAYS - SYDNEY,ALMAS GENA,Booking,0,SPORTS UTILITY VEHICLE,28138998,0,0,28138952,...,Silver,28604160,0,0,28599840,28599840,28184192,31645440,0,Sydney
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7129,1,AUSTRALIAN PACIFIC (INBOUND) (HL),JATINDER GILL,Booking,0,PEOPLE MOVER (6-7 SEATS),28148830,0,0,28148830,...,White,28661760,0,0,28368000,28452960,28184491,31645440,0,CAIRNS
7134,1,AUSTRALIAN PACIFIC (INBOUND) (HL),JATINDER GILL,Booking,0,PEOPLE MOVER (6-7 SEATS),28148599,0,0,28148599,...,White,28661760,0,0,28368000,28452960,28184491,31645440,0,CAIRNS
7147,2,HUGHES CREDIT CARD- CORPORATE,ROSHAN BURAH,Booking,0,SPORTS UTILITY VEHICLE,28151994,0,0,28151945,...,Black,28310400,0,0,28249920,28654560,28183700,28702080,0,PERTH
7156,2,HUGHES CREDIT CARD- CORPORATE,WAYNE CHAPMAN,Booking,0,SEDAN,28151322,0,0,28151288,...,Black,28355040,0,0,28353600,28356480,28183038,28703520,0,PERTH


In [13]:
user_feat

['E', 'G', 'H', 'I', 'W', 'Y', 'Z', 'AB', 'AH', 'AJ', 'AK', 'AF', 'AG', 'AL']

In [14]:
item_feat

['H', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'T', 'U', 'AA']