In [5]:
import os
from dotenv import load_dotenv

# Загрузка переменных окружения из файла .params.env
load_dotenv(dotenv_path='envs/.params.env')

# Чтение констант
train_data_url = os.getenv('TRAIN_DATA_URL')
test_data_url = os.getenv('TEST_DATA_URL')
test_size = float(os.getenv('TEST_SIZE', 0.2))  
epochs = int(os.getenv('EPOCHS', 10))  
model_param_1 = float(os.getenv('MODEL_PARAMETR_1', 0.001))

print(f"Train Data URL: {train_data_url}")
print(f"Test Data URL: {test_data_url}")
print(f"Test Size: {test_size}")
print(f"Epochs: {epochs}")
print(f"Model Parameter 1: {model_param_1}")


Train Data URL: https://storage.googleapis.com/tf-datasets/titanic/train.csv
Test Data URL: https://storage.googleapis.com/tf-datasets/titanic/eval.csv
Test Size: 0.2
Epochs: 10
Model Parameter 1: 0.001


# Домашнее задание

В этом задании необходимо решить задачу классификации и регрессии с помощью фреймоврка `pytorch`

In [7]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Пример предобработки данных
data = pd.read_csv('dataset.csv')
data.fillna(0, inplace=True)
print("Данные успешно загружены и обработаны.")

In [1]:
import warnings
warnings.filterwarnings('ignore', category=DeprecationWarning)

In [3]:
import pandas as pd
from torch import nn
import torch
from torch.utils.data import TensorDataset, DataLoader


## Классификация

Для задачи классификации будет использоваться набор [космического титаника](https://drive.google.com/file/d/143k5oFdfVRDpke9GCnfzTcQNA9jIS0O-/view?usp=sharing). Описание датасета на [kaggle](https://www.kaggle.com/competitions/spaceship-titanic/data?select=train.csv)

1) Импорт и обработка данных

- обработкайте пропуски, 

- масштабирование данных,

- обработка категориальных признаков

In [66]:
df = pd.read_csv('spacesheep-titanic.csv')
df.head

<bound method NDFrame.head of      PassengerId HomePlanet CryoSleep     Cabin    Destination   Age    VIP  \
0        0001_01     Europa     False     B/0/P    TRAPPIST-1e  39.0  False   
1        0002_01      Earth     False     F/0/S    TRAPPIST-1e  24.0  False   
2        0003_01     Europa     False     A/0/S    TRAPPIST-1e  58.0   True   
3        0003_02     Europa     False     A/0/S    TRAPPIST-1e  33.0  False   
4        0004_01      Earth     False     F/1/S    TRAPPIST-1e  16.0  False   
...          ...        ...       ...       ...            ...   ...    ...   
8688     9276_01     Europa     False    A/98/P    55 Cancri e  41.0   True   
8689     9278_01      Earth      True  G/1499/S  PSO J318.5-22  18.0  False   
8690     9279_01      Earth     False  G/1500/S    TRAPPIST-1e  26.0  False   
8691     9280_01     Europa     False   E/608/S    55 Cancri e  32.0  False   
8692     9280_02     Europa     False   E/608/S    TRAPPIST-1e  44.0  False   

      RoomService  Fo

In [67]:
df.isnull().sum()

PassengerId       0
HomePlanet      201
CryoSleep       217
Cabin           199
Destination     182
Age             179
VIP             203
RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
Name            200
Transported       0
dtype: int64

In [68]:
df['Age'].fillna(df['Age'].mean(), inplace=True)
df['CryoSleep'].fillna(df['CryoSleep'].mode()[0], inplace=True)
df['HomePlanet'].fillna(df['HomePlanet'].mode()[0], inplace=True)
df['FoodCourt'].interpolate(method='linear', inplace=True)
df['VRDeck'].interpolate(method='polynomial', order=2, inplace=True)
df.drop(['Cabin'], axis=1, inplace=True)
df.dropna(axis=0, inplace=True)

In [69]:
df[:5]

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [71]:
# масштабирование данных
numeric_columns = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']  # Числовые столбцы
for column in numeric_columns:
    df[column] = (df[column] - df[column].mean()) / df[column].std()

In [72]:
df[:10]

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,TRAPPIST-1e,0.703529,False,-0.334438,-0.287815,-0.310955,-0.271765,-0.266596,Maham Ofracculy,False
1,0002_01,Earth,False,TRAPPIST-1e,-0.341356,False,-0.169474,-0.282264,-0.266391,0.205914,-0.227261,Juanna Vines,True
2,0003_01,Europa,False,TRAPPIST-1e,2.02705,True,-0.269361,1.917867,-0.310955,5.570881,-0.222791,Altark Susent,False
3,0003_02,Europa,False,TRAPPIST-1e,0.285575,False,-0.334438,0.503541,0.350383,2.624761,-0.09406,Solam Susent,False
4,0004_01,Earth,False,TRAPPIST-1e,-0.898628,False,0.124131,-0.244639,-0.041785,0.219835,-0.264808,Willy Santantines,True
5,0005_01,Earth,False,PSO J318.5-22,1.051824,False,-0.334438,0.0101,-0.310955,-0.018569,-0.266596,Sandie Hinetthews,True
6,0006_01,Earth,False,TRAPPIST-1e,-0.202038,False,-0.270874,0.661442,-0.305608,-0.271765,-0.266596,Billex Jacostaffey,True
7,0006_02,Earth,True,TRAPPIST-1e,-0.06272,False,-0.334438,-0.287815,-0.310955,-0.271765,-0.26846,Candra Jacostaffey,True
8,0007_01,Earth,False,TRAPPIST-1e,0.424893,False,-0.334438,0.196374,-0.280652,-0.083826,-0.266596,Andona Beston,True
9,0008_01,Europa,True,55 Cancri e,-1.037946,False,-0.334438,-0.287815,-0.310955,-0.271765,-0.266596,Erraiam Flatic,True


In [73]:
df = pd.get_dummies(df, columns=['HomePlanet', 'CryoSleep', 'Destination', 'VIP'], drop_first=True)
df[:5]

Unnamed: 0,PassengerId,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported,HomePlanet_Europa,HomePlanet_Mars,CryoSleep_True,Destination_PSO J318.5-22,Destination_TRAPPIST-1e,VIP_True
0,0001_01,0.703529,-0.334438,-0.287815,-0.310955,-0.271765,-0.266596,Maham Ofracculy,False,True,False,False,False,True,False
1,0002_01,-0.341356,-0.169474,-0.282264,-0.266391,0.205914,-0.227261,Juanna Vines,True,False,False,False,False,True,False
2,0003_01,2.02705,-0.269361,1.917867,-0.310955,5.570881,-0.222791,Altark Susent,False,True,False,False,False,True,True
3,0003_02,0.285575,-0.334438,0.503541,0.350383,2.624761,-0.09406,Solam Susent,False,True,False,False,False,True,False
4,0004_01,-0.898628,0.124131,-0.244639,-0.041785,0.219835,-0.264808,Willy Santantines,True,False,False,False,False,True,False


2) Разделение данных на train и test

In [74]:
train_df = df.sample(frac=0.8, random_state=42)
test_df = df.drop(train_df.index)

In [75]:
train_df.shape[0], test_df.shape[0]

(6069, 1517)

In [76]:
train_df[:5]

Unnamed: 0,PassengerId,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported,HomePlanet_Europa,HomePlanet_Mars,CryoSleep_True,Destination_PSO J318.5-22,Destination_TRAPPIST-1e,VIP_True
886,0951_01,0.63387,-0.334438,-0.287815,-0.310955,-0.271765,-0.266596,Done Lunapperts,True,False,False,True,True,False,False
735,0772_01,0.564211,-0.334438,-0.277946,0.268384,3.622753,0.948307,Minoton Hednigic,False,True,False,False,False,False,False
1013,1083_01,-0.132379,-0.334438,-0.287815,-0.310955,-0.271765,-0.266596,Ste Sfin,True,False,True,True,False,True,False
4998,5341_01,-0.619992,-0.334438,-0.287815,-0.310955,-0.271765,-0.266596,Jessa Greenez,True,False,False,True,False,True,False
6395,6756_02,1.191142,-0.334438,-0.287815,1.013505,-0.268284,-0.261232,Monah Pittson,True,False,False,False,True,False,False


In [77]:

X = train_df.drop(['Transported', 'Name', 'PassengerId'], axis=1, errors='ignore').astype(float)
X_np = X.to_numpy()
X_train = torch.tensor(X_np, dtype=torch.float)
y = train_df['Transported'].astype(float)  # Преобразование в float для совместимости
y_np = y.to_numpy()
y_train = torch.tensor(y_np, dtype=torch.float).view(-1, 1)  # Добавляем размерность для совместимости с BCELoss

train_dataset = TensorDataset(X_train, y_train)
train_loader = DataLoader(dataset=train_dataset, batch_size=64, shuffle=True)



In [78]:
train_df[:5]

Unnamed: 0,PassengerId,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported,HomePlanet_Europa,HomePlanet_Mars,CryoSleep_True,Destination_PSO J318.5-22,Destination_TRAPPIST-1e,VIP_True
886,0951_01,0.63387,-0.334438,-0.287815,-0.310955,-0.271765,-0.266596,Done Lunapperts,True,False,False,True,True,False,False
735,0772_01,0.564211,-0.334438,-0.277946,0.268384,3.622753,0.948307,Minoton Hednigic,False,True,False,False,False,False,False
1013,1083_01,-0.132379,-0.334438,-0.287815,-0.310955,-0.271765,-0.266596,Ste Sfin,True,False,True,True,False,True,False
4998,5341_01,-0.619992,-0.334438,-0.287815,-0.310955,-0.271765,-0.266596,Jessa Greenez,True,False,False,True,False,True,False
6395,6756_02,1.191142,-0.334438,-0.287815,1.013505,-0.268284,-0.261232,Monah Pittson,True,False,False,False,True,False,False


3) Определите модель torch, для этого определите количество входов и выходов

In [79]:
N_INPUT = X.shape[1]
N_OUTPUT = 1
N_INPUT, N_OUTPUT

(12, 1)

Определите модель с 2 скрытыми слоями 

In [102]:
N_HIDDEN1 = 26 
N_HIDDEN2 = 13

model = nn.Sequential(
    nn.Linear(N_INPUT, N_HIDDEN1),
    nn.ReLU(),
    nn.Linear(N_HIDDEN1, N_HIDDEN2),
    nn.ReLU(),
    nn.Linear(N_HIDDEN2, N_OUTPUT),
    nn.Sigmoid()
)

Определите функцию потерь и оптимизатор

In [103]:
criterion = nn.BCELoss()  # Функция потерь
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)  # Оптимизатор Adam

4) Обучение модели - реализуйте цикл обучения

In [99]:
epochs = 20
batch_size = 64

In [86]:
def lern(epochs, train_loader, optimizer, criterion):
    for epoch in range(epochs):
        epoch_loss = 0.0
        for i, (inputs, labels) in enumerate(train_loader):
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()
            if (i + 1) % 100 == 0:
                print(f'Эпоха [{epoch + 1}/{epochs}], Шаг [{i + 1}/{len(train_loader)}], Loss: {epoch_loss / (i + 1)}')
        print(f'Эпоха [{epoch + 1}/{epochs}], Loss: {epoch_loss / len(train_loader)}')

lern(epochs, train_loader, optimizer, criterion)

Эпоха [1/20], Loss: 0.4266681887601551
Эпоха [2/20], Loss: 0.42699626433221916
Эпоха [3/20], Loss: 0.42628156323181954
Эпоха [4/20], Loss: 0.42619096542659557
Эпоха [5/20], Loss: 0.4258824558634507
Эпоха [6/20], Loss: 0.42528022496323836
Эпоха [7/20], Loss: 0.42516844868659975
Эпоха [8/20], Loss: 0.42411317041045743
Эпоха [9/20], Loss: 0.42401725618462816
Эпоха [10/20], Loss: 0.42386056247510406
Эпоха [11/20], Loss: 0.42337316055046886
Эпоха [12/20], Loss: 0.4229733250643078
Эпоха [13/20], Loss: 0.42367395539032787
Эпоха [14/20], Loss: 0.42305878149835685
Эпоха [15/20], Loss: 0.4228029793814609
Эпоха [16/20], Loss: 0.42262586888514064
Эпоха [17/20], Loss: 0.4231581098154972
Эпоха [18/20], Loss: 0.42252271300867983
Эпоха [19/20], Loss: 0.4218062294156928
Эпоха [20/20], Loss: 0.4219575540015572


5) Выведите метрики модели

In [88]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


X_test = test_df.drop(['Transported', 'Name', 'PassengerId'], axis=1, errors='ignore').astype(float)
y_test = test_df['Transported'].astype(float)
X_test_tensor = torch.tensor(X_test.values, dtype=torch.float)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.float).view(-1, 1)

def metrics(model, X_test_tensor, y_test_tensor):
    model.eval() 
    with torch.no_grad():
        predictions = model(X_test_tensor).round()  # округление до 0 или 1

    
    predictions_np = predictions.numpy()
    y_test_np = y_test_tensor.numpy()

    accuracy = accuracy_score(y_test_np, predictions_np)
    precision = precision_score(y_test_np, predictions_np)
    recall = recall_score(y_test_np, predictions_np)
    f1 = f1_score(y_test_np, predictions_np)

    print(accuracy)
    print(precision)
    print(recall)
    print(f1)
    
metrics(model, X_test_tensor, y_test_tensor)

0.8094924192485168
0.7917647058823529
0.8573248407643312
0.8232415902140673


6) Поварьируйте параметры модели, оптимизатор - посмотрите как параметры влияют на метрики и скорость сходимости

In [90]:
# увеличение количества нейронов
N_HIDDEN1 = 52  
N_HIDDEN2 = 26  

model = nn.Sequential(
    nn.Linear(N_INPUT, N_HIDDEN1),
    nn.ReLU(),
    nn.Linear(N_HIDDEN1, N_HIDDEN2),
    nn.ReLU(),
    nn.Linear(N_HIDDEN2, N_OUTPUT),
    nn.Sigmoid()
)

In [91]:
lern(epochs, train_loader, optimizer, criterion)
metrics(model, X_test_tensor, y_test_tensor)

Эпоха [1/20], Loss: 0.6963554476436816
Эпоха [2/20], Loss: 0.6963628674808301
Эпоха [3/20], Loss: 0.6963634516063489
Эпоха [4/20], Loss: 0.6963582396507263
Эпоха [5/20], Loss: 0.696366044094688
Эпоха [6/20], Loss: 0.6963600491222582
Эпоха [7/20], Loss: 0.6963643670082093
Эпоха [8/20], Loss: 0.6963586418252242
Эпоха [9/20], Loss: 0.6963605523109436
Эпоха [10/20], Loss: 0.6963531531785664
Эпоха [11/20], Loss: 0.6963620938752827
Эпоха [12/20], Loss: 0.6963655798058761
Эпоха [13/20], Loss: 0.6963595415416517
Эпоха [14/20], Loss: 0.6963695187317698
Эпоха [15/20], Loss: 0.6963736653327942
Эпоха [16/20], Loss: 0.6963601614299574
Эпоха [17/20], Loss: 0.6963493598134894
Эпоха [18/20], Loss: 0.6963693813273781
Эпоха [19/20], Loss: 0.6963531418850547
Эпоха [20/20], Loss: 0.69637337672083
0.5003295978905735
0.5906040268456376
0.11210191082802548
0.18843683083511778


In [95]:
# Изменение оптимизатора
optimizer = torch.optim.SGD(model.parameters(), lr=0.001)  # Использование SGD вместо Adam


model = nn.Sequential(
    nn.Linear(N_INPUT, N_HIDDEN1),
    nn.ReLU(),
    nn.Linear(N_HIDDEN1, N_HIDDEN2),
    nn.ReLU(),
    nn.Linear(N_HIDDEN2, N_OUTPUT),
    nn.Sigmoid()
)
lern(epochs, train_loader, optimizer, criterion)
metrics(model, X_test_tensor, y_test_tensor)

Эпоха [1/20], Loss: 0.6926744630462245
Эпоха [2/20], Loss: 0.6926688238194114
Эпоха [3/20], Loss: 0.6926511814719752
Эпоха [4/20], Loss: 0.6926671184991535
Эпоха [5/20], Loss: 0.6926696325603284
Эпоха [6/20], Loss: 0.6926670494832491
Эпоха [7/20], Loss: 0.6926636495088276
Эпоха [8/20], Loss: 0.6926711527924788
Эпоха [9/20], Loss: 0.6926607960148862
Эпоха [10/20], Loss: 0.6926585442141483
Эпоха [11/20], Loss: 0.6926619529724121
Эпоха [12/20], Loss: 0.692669153213501
Эпоха [13/20], Loss: 0.692661400217759
Эпоха [14/20], Loss: 0.6926653642403452
Эпоха [15/20], Loss: 0.6926671818683022
Эпоха [16/20], Loss: 0.6926713341160824
Эпоха [17/20], Loss: 0.6926812774256657
Эпоха [18/20], Loss: 0.6926646577684503
Эпоха [19/20], Loss: 0.692673302951612
Эпоха [20/20], Loss: 0.6926563476261339
0.5293342122610415
0.5237140948563794
0.9987261146496815
0.6871165644171779


In [101]:
# Изменение размера батча

batch_size = 128  # Увеличение размера батча

# Создание нового DataLoader с измененным размером батча
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)

In [104]:
lern(epochs, train_loader, optimizer, criterion)
metrics(model, X_test_tensor, y_test_tensor)

Эпоха [1/20], Loss: 0.6779834541181723
Эпоха [2/20], Loss: 0.6137389031549295
Эпоха [3/20], Loss: 0.5094292772312959
Эпоха [4/20], Loss: 0.45728949705759686
Эпоха [5/20], Loss: 0.4444992144902547
Эпоха [6/20], Loss: 0.4405236920962731
Эпоха [7/20], Loss: 0.4389571485420068
Эпоха [8/20], Loss: 0.4367892052978277
Эпоха [9/20], Loss: 0.43582196657856304
Эпоха [10/20], Loss: 0.4345147634545962
Эпоха [11/20], Loss: 0.4338782435903947
Эпоха [12/20], Loss: 0.434299632286032
Эпоха [13/20], Loss: 0.4327590937415759
Эпоха [14/20], Loss: 0.4321047042806943
Эпоха [15/20], Loss: 0.4315802411486705
Эпоха [16/20], Loss: 0.42975501095255214
Эпоха [17/20], Loss: 0.43189873546361923
Эпоха [18/20], Loss: 0.4307515745361646
Эпоха [19/20], Loss: 0.4284814999749263
Эпоха [20/20], Loss: 0.4292016625404358
0.8068556361239289
0.7907801418439716
0.8522292993630574
0.8203556100551809


## Регрессия

Для задачи прогнозирования будет использоваться набор [прогноза аренды жилья](https://drive.google.com/file/d/1CH6FKsiw_k1nt_CJm5jFMUWsD1p0G3Km/view?usp=sharing). Описание датасета на [kaggle](https://www.kaggle.com/datasets/iamsouravbanerjee/house-rent-prediction-dataset?select=House_Rent_Dataset.csv)

1) Импорт и обработка данных

- обработкайте пропуски, 

- масштабирование данных,

- обработка категориальных признаков

In [170]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn import preprocessing

In [171]:
data = pd.read_csv('House_Rent_Dataset.csv')

In [172]:
data.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
BHK,4746.0,2.08386,0.832256,1.0,2.0,2.0,3.0,6.0
Rent,4746.0,34993.451327,78106.412937,1200.0,10000.0,16000.0,33000.0,3500000.0
Size,4746.0,967.490729,634.202328,10.0,550.0,850.0,1200.0,8000.0
Bathroom,4746.0,1.965866,0.884532,1.0,1.0,2.0,2.0,10.0


In [173]:
data["Floor Number"]=data["Floor"].apply(lambda x:str(x).split()[0])
data["Total Floor"]=data["Floor"].apply(lambda x:str(x).split()[-1])
del data["Floor"]
data["Floor Number"] = data["Floor Number"].replace(['Ground'],0)
data["Floor Number"] = data["Floor Number"].replace(['Lower'],-2)
data["Floor Number"] = data["Floor Number"].replace(['Upper'],-1)
data["Floor Number"] = data["Floor Number"].replace(['Ground'],0)
data["Total Floor"] = data["Total Floor"].replace(['Ground'],1)
data.sample(5)


Unnamed: 0,Posted On,BHK,Rent,Size,Area Type,Area Locality,City,Furnishing Status,Tenant Preferred,Bathroom,Point of Contact,Floor Number,Total Floor
870,2022-07-07,2,45000,710,Carpet Area,Saki Vihar Road,Mumbai,Unfurnished,Bachelors/Family,2,Contact Agent,8,16
35,2022-05-12,2,15000,850,Carpet Area,Sreebhumi,Kolkata,Semi-Furnished,Bachelors/Family,1,Contact Owner,1,2
2454,2022-05-27,2,9000,145,Super Area,Palam,Delhi,Unfurnished,Bachelors/Family,2,Contact Owner,1,3
250,2022-06-19,2,6000,600,Carpet Area,Santoshpur,Kolkata,Semi-Furnished,Bachelors/Family,1,Contact Owner,0,3
2689,2022-05-27,2,13000,400,Carpet Area,Rohini Sector 24,Delhi,Semi-Furnished,Bachelors/Family,2,Contact Agent,0,4


In [174]:
data["Total Floor"].value_counts()



Total Floor
4     938
3     915
2     868
5     422
1     335
     ... 
39      1
62      1
59      1
66      1
1       1
Name: count, Length: 67, dtype: int64

In [175]:
label_encoder = preprocessing.LabelEncoder()
data["Area Type"]= label_encoder.fit_transform(data["Area Type"])
data["Area Type"].unique()

array([2, 1, 0])

In [176]:
data["Furnishing Status"]= label_encoder.fit_transform(data["Furnishing Status"])
data["Furnishing Status"].unique()

array([2, 1, 0])

In [178]:
data["Tenant Preferred"]= label_encoder.fit_transform(data["Tenant Preferred"])
data["Tenant Preferred"].unique()

array([1, 0, 2])

In [179]:
data["Point of Contact"]= label_encoder.fit_transform(data["Point of Contact"])
data["Point of Contact"].unique()

array([2, 0, 1])

In [182]:
data.head()

Unnamed: 0,BHK,Rent,Size,Area Type,City,Furnishing Status,Tenant Preferred,Bathroom,Point of Contact,Floor Number,Total Floor
0,2,10000,1100,2,Kolkata,2,1,2,2,0,2
1,2,20000,800,2,Kolkata,1,1,1,2,1,3
2,2,17000,1000,2,Kolkata,1,1,1,2,1,3
3,2,10000,800,2,Kolkata,2,1,1,2,1,2
4,2,7500,850,1,Kolkata,2,0,1,2,1,2


In [181]:
data.drop(["Posted On"], axis="columns", inplace=True)
data.drop(["Area Locality"], axis="columns", inplace=True)

In [183]:
data['City']= label_encoder.fit_transform(data['City'])
data['City'].unique()

array([4, 5, 0, 2, 1, 3])

In [185]:
from sklearn.preprocessing import minmax_scale

data["Rent"] = minmax_scale(data["Rent"])
data["Size"] = minmax_scale(data["Size"])

2) Разделение данных на train и test

In [186]:
from sklearn.model_selection import train_test_split

X = data.drop(['Rent'], axis=1)  # Удаляем целевую переменную
y = data['Rent']  # Основная переменная

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)  # 20% данных на тестовую выборку

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((3796, 10), (950, 10), (3796,), (950,))

In [187]:
from sklearn.preprocessing import StandardScaler

In [205]:
ss=StandardScaler()
X_train_scaled = ss.fit_transform(X_train)
X_test_scaled = ss.transform(X_test)

In [206]:
X_train_torch = torch.tensor(X_train_scaled, dtype=torch.float32)
y_train_torch = torch.tensor(y_train.values, dtype=torch.float32).view(-1, 1)  # Делаем y двумерным тензором
X_test_torch = torch.tensor(X_test_scaled, dtype=torch.float32)
y_test_torch = torch.tensor(y_test.values, dtype=torch.float32).view(-1, 1)

3) Определите модель torch, для этого определите количество входов и выходов

In [199]:
N_INPUT = X.shape[1]
N_OUTPUT = 1 
N_INPUT, N_OUTPUT

(10, 1)

In [200]:
N_HIDDEN_1 = 64
N_HIDDEN_2 = 32

Определите модель с 2 скрытыми слоями

In [201]:
model = nn.Sequential(
    nn.Linear(N_INPUT, N_HIDDEN_1),  # Первый слой (входной)
    nn.ReLU(),  # Функция активации для первого скрытого слоя
    nn.Linear(N_HIDDEN_1, N_HIDDEN_2),  # Второй слой (скрытый)
    nn.ReLU(),  # Функция активации для второго скрытого слоя
    nn.Linear(N_HIDDEN_2, N_OUTPUT)  # Третий слой (выходной)
)

In [202]:
print(model)

Sequential(
  (0): Linear(in_features=10, out_features=64, bias=True)
  (1): ReLU()
  (2): Linear(in_features=64, out_features=32, bias=True)
  (3): ReLU()
  (4): Linear(in_features=32, out_features=1, bias=True)
)


Определите функцию потерь и оптимизатор

In [203]:
import torch.optim as optim

In [204]:
loss_function = nn.MSELoss()  
optimizer = optim.Adam(model.parameters(), lr=0.001) 


4) Обучение модели - реализуйте цикл обучения

In [207]:
train_data = TensorDataset(X_train_torch, y_train_torch)
test_data = TensorDataset(X_test_torch, y_test_torch)
train_loader = DataLoader(train_data, batch_size=64, shuffle=True)
test_loader = DataLoader(test_data, batch_size=64, shuffle=False)

In [211]:
num_epochs = 20

for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    for inputs, targets in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = loss_function(outputs, targets)
        loss.backward()
        optimizer.step()
        running_loss += loss.item() * inputs.size(0)
    epoch_loss = running_loss / len(train_loader.dataset)

    print(f'Эпоха {epoch+1}/{num_epochs}, Loss: {epoch_loss:.4f}')

Эпоха 1/20, Loss: 0.0000
Эпоха 2/20, Loss: 0.0000
Эпоха 3/20, Loss: 0.0000
Эпоха 4/20, Loss: 0.0000
Эпоха 5/20, Loss: 0.0000
Эпоха 6/20, Loss: 0.0000
Эпоха 7/20, Loss: 0.0000
Эпоха 8/20, Loss: 0.0001
Эпоха 9/20, Loss: 0.0003
Эпоха 10/20, Loss: 0.0000
Эпоха 11/20, Loss: 0.0001
Эпоха 12/20, Loss: 0.0001
Эпоха 13/20, Loss: 0.0001
Эпоха 14/20, Loss: 0.0000
Эпоха 15/20, Loss: 0.0000
Эпоха 16/20, Loss: 0.0000
Эпоха 17/20, Loss: 0.0000
Эпоха 18/20, Loss: 0.0000
Эпоха 19/20, Loss: 0.0000
Эпоха 20/20, Loss: 0.0000


5) Выведите метрики модели

In [213]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

def evaluate_model(loader):
    model.eval()
    actuals = []
    predictions = []
    with torch.no_grad():
        for inputs, targets in loader:
            outputs = model(inputs)
            actuals.extend(targets.view(-1).tolist())
            predictions.extend(outputs.view(-1).tolist())
    
    mse = mean_squared_error(actuals, predictions)
    mae = mean_absolute_error(actuals, predictions)
    r2 = r2_score(actuals, predictions)
    return mse, mae, r2

# Оценка модели на обучающей выборке
train_mse, train_mae, train_r2 = evaluate_model(train_loader)
print(f'Train MSE: {train_mse:.4f}, Train MAE: {train_mae:.4f}, Train R^2: {train_r2:.4f}')

# Оценка модели на тестовой выборке
test_mse, test_mae, test_r2 = evaluate_model(test_loader)
print(f'Test MSE: {test_mse:.4f}, Test MAE: {test_mae:.4f}, Test R^2: {test_r2:.4f}')
print(f'R^2: {r2}')


Train MSE: 0.0000, Train MAE: 0.0023, Train R^2: 0.9629
Test MSE: 0.0002, Test MAE: 0.0041, Test R^2: 0.4370
R^2: 0.23693069638203434
