In [22]:
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.linear_model import ElasticNetCV, RidgeCV
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error, r2_score, make_scorer
from sklearn.base import TransformerMixin, RegressorMixin

In [23]:
warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)

In [24]:
df = pd.read_csv('train.csv', index_col=0)
X = df.drop(columns='source_attractiveness')
Y = df['source_attractiveness']
df.head()

Unnamed: 0,category,clicks,likes,buys,4xx_errors,5xx_errors,complaints_count,average_dwelltime,source_attractiveness,date_of_registration
6622,ecom,6488536.0,,0,82221,0,0,18.450527,0.46175,2020-04-21 16:04:41.817367072
2047,information_source,874840.0,21100.0,0,12872,0,0,10.721619,-0.022317,2024-07-19 23:50:07.268931816
1118,information_source,571210.0,94707.0,0,0,7420,0,1.922243,0.046396,2024-07-13 16:35:54.794883135
4992,news,89534.0,924.0,0,834,0,0,2.149243,-0.09336,2024-09-10 21:29:14.006315095
9970,information_source,1043953.0,289288.0,0,58375,20260,3948,3.764965,0.027303,2024-05-26 11:07:15.950527838


In [25]:
class BaseDataPreprocessor(TransformerMixin):
    def __init__(self):
        self.likes_mean = dict()
        self.clicks_mean = dict()
        self.complaints_mean = dict() 
        self.dwelltime_mean = dict()
        self.buys_mean = dict()
        self.coef = dict()

    @staticmethod
    def fix_complaints_count(df):
        df.loc[df['complaints_count'] == '-', 'complaints_count'] = float('nan')
        df['complaints_count'] = df['complaints_count'].astype(float)
        return df

    @staticmethod
    def fix_average_dwelltime(df):
        df.loc[df['average_dwelltime'] < 0, 'average_dwelltime'] = float('nan')
        return df
    
    @staticmethod
    def fill_na(df, feature, mean_by_category):
        df_na = df[feature].isna()
        df[feature] = df.apply(lambda row: row[feature] if pd.notnull(row[feature]) else mean_by_category[row['category']], axis=1)
        return df
    
    def fix_clicks(self, df):
        def fix(row):
            s = row['likes'] + row['complaints_count'] + row['buys']
            if pd.isnull(row['clicks']):
                return s * self.coef[row['category']]
            else:
                return row['clicks']
            
        df['clicks'] = df.apply(fix, axis=1)
        return df

    @staticmethod
    def cast_date_of_registration(df):
        df['date_of_registration'] = pd.to_datetime(df['date_of_registration'])
        return df

    @staticmethod
    def add_age(df):
        last_date = pd.Timestamp('2024-09-30')
        df['age'] = (last_date - df['date_of_registration']) / pd.Timedelta(days=365)
        return df.drop(columns='date_of_registration')
        
    @staticmethod
    def cast_errors_and_buys(df):
        df['4xx_errors'] = df['4xx_errors'].astype(float)
        df['5xx_errors'] = df['5xx_errors'].astype(float)
        df['buys'] = df['buys'].astype(float)
        return df
    
    def fit(self, data, *args):
        data = self.fix_complaints_count(data)
        data = self.fix_average_dwelltime(data)
        self.likes_mean = dict(data[['likes', 'category']].groupby('category').mean()['likes'])
        self.clicks_mean = dict(data[['clicks', 'category']].groupby('category').mean()['clicks'])
        self.complaints_mean = dict(data[['complaints_count', 'category']].groupby('category').mean()['complaints_count'])
        self.dwelltime_mean = dict(data[['average_dwelltime', 'category']].groupby('category').mean()['average_dwelltime'])
        self.buys_mean = dict(data[['buys', 'category']].groupby('category').mean()['buys'])

        for key, value in self.clicks_mean.items():
            self.coef[key] = self.clicks_mean[key] / (self.likes_mean[key] + self.complaints_mean[key] + self.buys_mean[key])
        
        return self

    def transform(self, data):
        data = self.fix_complaints_count(data)
        data = self.fix_average_dwelltime(data)
        data = self.cast_date_of_registration(data)
        data = self.cast_errors_and_buys(data)
        
        data = self.fill_na(data, 'likes', self.likes_mean)
        data = self.fill_na(data, 'average_dwelltime', self.dwelltime_mean)
        data = self.fill_na(data, 'complaints_count', self.complaints_mean)
        
        data = self.fix_clicks(data)
        data = self.add_age(data)
        return data
    
    def fit_transform(self, data):
        self.fit(data)
        return self.transform(data)
    

In [26]:
preprocessor = BaseDataPreprocessor()
X = preprocessor.fit_transform(X)
X.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8000 entries, 6622 to 9289
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   category           8000 non-null   object 
 1   clicks             8000 non-null   float64
 2   likes              8000 non-null   float64
 3   buys               8000 non-null   float64
 4   4xx_errors         8000 non-null   float64
 5   5xx_errors         8000 non-null   float64
 6   complaints_count   8000 non-null   float64
 7   average_dwelltime  8000 non-null   float64
 8   age                8000 non-null   float64
dtypes: float64(8), object(1)
memory usage: 625.0+ KB


In [27]:
class FeatureGenerator(TransformerMixin):
    def __init__(self):
        self.poly = PolynomialFeatures(2, include_bias=False)
        
    @staticmethod
    def get_per_year(df, feature):
        return df[feature] / df['age'].replace(0, 1)
    
    @staticmethod
    def get_per_click(df, feature):
        return df[feature] / df['clicks'].replace(0, 1)
    
    @staticmethod
    def get_per_dwelltime(df, feature):
        return df[feature] / df['average_dwelltime'].replace(0, 1)

    def fit(self, data, *args):
        self.poly = self.poly.fit(data)
        return self
        
    def transform(self, data):
        data_poly = self.poly.transform(data)
        cols = self.poly.get_feature_names_out(data.columns)
        data[cols] = data_poly
        
        for i in data:
            data[f'log({i})'] = np.log1p(data[i])

        for i in data:
            data[f'{i}_per_year'] = self.get_per_year(data, i)
            data[f'{i}_per_click'] = self.get_per_click(data, i)
            data[f'{i}_per_dwelltime'] = self.get_per_dwelltime(data, i)
        
        return data
        
    def fit_transform(self, data):
        self.fit(data)
        return self.transform(data)

In [7]:
feature_generator = FeatureGenerator()
cat = X['category']
X = feature_generator.fit_transform(X.drop(columns='category'))
X['category'] = cat
X.head()

Unnamed: 0,clicks,likes,buys,4xx_errors,5xx_errors,complaints_count,average_dwelltime,age,clicks^2,clicks likes,...,log(average_dwelltime^2)_per_year,log(average_dwelltime^2)_per_click,log(average_dwelltime^2)_per_dwelltime,log(average_dwelltime age)_per_year,log(average_dwelltime age)_per_click,log(average_dwelltime age)_per_dwelltime,log(age^2)_per_year,log(age^2)_per_click,log(age^2)_per_dwelltime,category
6622,6488536.0,1633087.0,0.0,82221.0,0.0,0.0,18.450527,4.44474,42101100000000.0,10596350000000.0,...,1.312365,8.989885e-07,0.316149,0.994194,6.810372e-07,0.239502,0.682339,4.674123e-07,0.164376,ecom
2047,874840.0,21100.0,0.0,12872.0,0.0,0.0,10.721619,0.197279,765345000000.0,18459120000.0,...,24.093716,5.433206e-06,0.443327,5.759747,1.29884e-06,0.10598,0.193537,4.364316e-08,0.003561,information_source
1118,571210.0,94707.0,0.0,0.0,7420.0,0.0,1.922243,0.214544,326280900000.0,54097590000.0,...,7.208335,2.707414e-06,0.80453,1.609434,6.044953e-07,0.179631,0.209752,7.878193e-08,0.023411,information_source
4992,89534.0,924.0,0.0,834.0,0.0,0.0,2.149243,0.052342,8016337000.0,82729420.0,...,32.979431,1.92798e-05,0.803165,2.036718,1.190667e-06,0.049601,0.05227,3.055712e-08,0.001273,news
9970,1043953.0,289288.0,0.0,58375.0,20260.0,3948.0,3.764965,0.346676,1089838000000.0,302003100000.0,...,7.844932,2.605143e-06,0.722356,2.409102,8.000139e-07,0.221829,0.327375,1.087146e-07,0.030144,information_source


In [8]:
class Scaler(TransformerMixin):
    def __init__(self):
        self.ecom_scaler = StandardScaler()
        self.inf_scaler = StandardScaler()
        self.news_scaler = StandardScaler()
        self.porn_scaler = StandardScaler()
        self.social_scaler = StandardScaler()
    
    def split_data(self, df):
        ecom_idx, inf_idx, news_idx, porn_idx, social_idx = self.get_indicies(df)
        ecom = df[ecom_idx].drop(['category'], axis=1)
        inf = df[inf_idx].drop(['category', 'buys'], axis=1)
        news = df[news_idx].drop(['category', 'buys'], axis=1)
        porn = df[porn_idx].drop(['category', 'buys'], axis=1)
        social = df[social_idx].drop(['category', 'buys'], axis=1)
        return ecom, inf, news, porn, social

    def get_indicies(self, df):
        ecom_idx = df['category'] == 'ecom'
        inf_idx = df['category'] == 'information_source'
        news_idx = df['category'] == 'news'
        porn_idx = df['category'] == 'porn'
        social_idx = df['category'] == 'social'
        return ecom_idx, inf_idx, news_idx, porn_idx, social_idx
    
    def fit(self, data, *args):
        ecom, inf, news, porn, social = self.split_data(data)
        self.ecom_scaler = self.ecom_scaler.fit(ecom)
        self.inf_scaler = self.inf_scaler.fit(inf)
        self.news_scaler = self.news_scaler.fit(news)
        self.porn_scaler = self.porn_scaler.fit(porn)
        self.social_scaler = self.social_scaler.fit(social)
        return self

    def transform(self, data):
        ecom_idx, inf_idx, news_idx, porn_idx, social_idx = self.get_indicies(X)
        
        data.loc[ecom_idx, self.ecom_scaler.feature_names_in_] = self.ecom_scaler.transform(data.loc[ecom_idx, self.ecom_scaler.feature_names_in_])
        data.loc[inf_idx, self.inf_scaler.feature_names_in_] = self.inf_scaler.transform(data.loc[inf_idx, self.inf_scaler.feature_names_in_])
        data.loc[news_idx, self.news_scaler.feature_names_in_] = self.news_scaler.transform(data.loc[news_idx, self.news_scaler.feature_names_in_])
        data.loc[porn_idx, self.porn_scaler.feature_names_in_] = self.porn_scaler.transform(data.loc[porn_idx, self.porn_scaler.feature_names_in_])
        data.loc[social_idx, self.social_scaler.feature_names_in_] = self.social_scaler.transform(data.loc[social_idx, self.social_scaler.feature_names_in_])

        return data
        
    def fit_transform(self, data):
        self.fit(data)
        return self.transform(data)

In [30]:
scaler = Scaler()
X = scaler.fit_transform(X)
X.describe()
X = scaler.transform(X)

In [10]:
class DomainRegression(RegressorMixin):
    def __init__(self, *args, **kwargs):
        self.ecom_model = ElasticNetCV(*args, **kwargs)
        self.inf_model = ElasticNetCV(*args, **kwargs)
        self.news_model = ElasticNetCV(*args, **kwargs)
        self.porn_model = ElasticNetCV(*args, **kwargs)
        self.social_model = ElasticNetCV(*args, **kwargs)

    def split_data(self, df):
        ecom_idx, inf_idx, news_idx, porn_idx, social_idx = self.get_indicies(df)
        ecom = df[ecom_idx].drop(['category'], axis=1)
        inf = df[inf_idx].drop(['category'], axis=1)
        news = df[news_idx].drop(['category'], axis=1)
        porn = df[porn_idx].drop(['category'], axis=1)
        social = df[social_idx].drop(['category'], axis=1)
        return ecom, inf, news, porn, social

    def get_indicies(self, df):
        ecom_idx = df['category'] == 'ecom'
        inf_idx = df['category'] == 'information_source'
        news_idx = df['category'] == 'news'
        porn_idx = df['category'] == 'porn'
        social_idx = df['category'] == 'social'
        return ecom_idx, inf_idx, news_idx, porn_idx, social_idx
    
    def fit(self, X, Y):
        df_Y = pd.DataFrame(Y)
        df_Y['category'] = X['category']
        ecom_X, inf_X, news_X, porn_X, social_X = self.split_data(X)
        ecom_Y, inf_Y, news_Y, porn_Y, social_Y = self.split_data(df_Y)

        self.ecom_model = self.ecom_model.fit(ecom_X, ecom_Y['source_attractiveness'])
        self.inf_model = self.inf_model.fit(inf_X.drop(['buys'], axis=1), inf_Y['source_attractiveness'])
        self.news_model = self.news_model.fit(news_X.drop(['buys'], axis=1), news_Y['source_attractiveness'])
        self.porn_model = self.porn_model.fit(porn_X.drop(['buys'], axis=1), porn_Y['source_attractiveness'])
        self.social_model = self.social_model.fit(social_X.drop(['buys'], axis=1), social_Y['source_attractiveness'])
        
        return self

    def predict(self, X):
        ecom_idx, inf_idx, news_idx, porn_idx, social_idx = self.get_indicies(X)

        pred = np.zeros((X.shape[0]), dtype=float)        

        pred[ecom_idx] = self.ecom_model.predict(X.loc[ecom_idx, self.ecom_model.feature_names_in_])
        pred[inf_idx] = self.inf_model.predict(X.loc[inf_idx, self.inf_model.feature_names_in_])
        pred[news_idx] = self.news_model.predict(X.loc[news_idx, self.news_model.feature_names_in_])
        pred[porn_idx] = self.porn_model.predict(X.loc[porn_idx, self.porn_model.feature_names_in_])
        pred[social_idx] = self.social_model.predict(X.loc[social_idx, self.social_model.feature_names_in_])

        return pred

    def get_params(self, deep=True):
        return self.ecom_model.get_params(deep)

    def set_params(self, **params):
        self.ecom_model.set_params(**params)
        self.inf_model.set_params(**params)
        self.news_model.set_params(**params)
        self.porn_model.set_params(**params)
        self.social_model.set_params(**params)
        return self

    def __repr__(self):
        return f"DomainRegression({self.ecom_model.get_params()})"

In [11]:
alphas = np.logspace(-1, 5)
l1_ratio = np.logspace(-1, 0)

In [12]:
model = DomainRegression(l1_ratio=l1_ratio, alphas=alphas)

In [13]:
X.shape

(8000, 353)

In [14]:
Y

6622    0.461750
2047   -0.022317
1118    0.046396
4992   -0.093360
9970    0.027303
          ...   
361     0.017470
2621   -0.019069
2605   -0.061523
1231    0.131620
9289   -0.000510
Name: source_attractiveness, Length: 8000, dtype: float64

In [15]:
scores = cross_val_score(
    estimator=model, # Модель, качество которой замеряем
    X=X, # Данные для обучения
    y=Y, # Таргет
    cv=5, # Количество фолдов
    scoring='neg_mean_squared_error', # Метрика
    n_jobs=-1 # Количетсво ядре для вычислений, -1 - все доступные ядра
)

In [16]:
scores

array([-0.00299309, -0.0036256 , -0.00348061, -0.00319169, -0.00397162])

In [17]:
scores.mean()

-0.0034525214529876067

In [19]:
pred = model.predict(X)

AttributeError: 'ElasticNetCV' object has no attribute 'feature_names_in_'