In [82]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import PolynomialFeatures


import os


notebook_path = os.path.abspath("ProjetoAA.ipynb")
print(notebook_path)


X_train = pd.read_csv("C:/Users/ptoma/Desktop/X_train.csv")



# Remover as linhas onde todos os valores dessas colunas são 0
#X_train = X_train.loc[~(X_train[['x_1', 'y_1', 'x_2', 'y_2', 'x_3', 'y_3']] == 0).all(axis=1)]


C:\Users\ptoma\ProjetoAA\ProjetoAA.ipynb


In [84]:
X_train.shape[0]

1285000

In [86]:
n_trajectories = len(X_train) // 257  # Número total de trajetórias
trajetory_id= np.repeat(np.arange(n_trajectories), 257)

# Adicionar ao dataframe como uma nova coluna
X_train['trajetory'] = trajetory_id


In [90]:


unique_trajectories = X_train['trajetory'].unique()

# Dividir os IDs das trajetórias em conjuntos de treino e teste
train_trajectories, test_trajectories = train_test_split(unique_trajectories, test_size=0.2, random_state=0)

# Usar os IDs das trajetórias para filtrar os conjuntos de treino e teste
train_data = X_train[X_train['trajetory'].isin(train_trajectories)]
test_data = X_train[X_train['trajetory'].isin(test_trajectories)]

# Agora você pode dividir o conjunto de treino em treino e validação
train_data, val_data = train_test_split(train_data, test_size=0.2, random_state=0)



In [92]:
train_features = train_data.iloc[:-1][['x_1', 'y_1', 'x_2', 'y_2', 'x_3', 'y_3', 't']]
train_targets = train_data.iloc[1:][['x_1', 'y_1', 'x_2', 'y_2', 'x_3', 'y_3']]

val_features = val_data.iloc[:-1][['x_1', 'y_1', 'x_2', 'y_2', 'x_3', 'y_3', 't']]
val_targets = val_data.iloc[1:][['x_1', 'y_1', 'x_2', 'y_2', 'x_3', 'y_3']]

test_features = test_data.iloc[:-1][['x_1', 'y_1', 'x_2', 'y_2', 'x_3', 'y_3', 't']]
test_targets = test_data.iloc[1:][['x_1', 'y_1', 'x_2', 'y_2', 'x_3', 'y_3']]

In [94]:


baseline = Pipeline([
    ('scaler', StandardScaler()),  
    ('regressor', LinearRegression())  
])

baseline.fit(train_features, train_targets)

In [96]:
# 6. Fazer previsões no conjunto de validação
val_predictions = baseline.predict(val_features)

# 7. Avaliar o erro (MSE)
mse = mean_squared_error(val_targets, val_predictions)
rmse=np.sqrt(mse)
print(f" Root Mean Squared Error : {rmse}")


 Root Mean Squared Error : 1.3914721803385806


In [98]:
#Testar o baseline model no conjunto de teste

test_predictions=baseline.predict(test_features)

mse = mean_squared_error(test_targets, test_predictions)
rmse=np.sqrt(mse)
print(f" Root Mean Squared Error : {rmse}")

 Root Mean Squared Error : 1.3266669502825916


In [118]:
X_test=pd.read_csv("C:/Users/ptoma/Desktop/X_test.csv")

rename_dict = {
    'x0_1': 'x_1',
    'y0_1': 'y_1',
    'x0_2': 'x_2',
    'y0_2': 'y_2',
    'x0_3': 'x_3',
    'y0_3': 'y_3'
}

# Renomear as colunas em X_test
X_test.rename(columns=rename_dict, inplace=True)

# Verificar as novas colunas de X_test
print(X_test.columns)

Index(['Id', 't', 'x_1', 'y_1', 'x_2', 'y_2', 'x_3', 'y_3'], dtype='object')


In [120]:
test_features=X_test[['x_1', 'y_1', 'x_2', 'y_2', 'x_3', 'y_3','t']]

In [128]:
print(test_features.shape)

(1041621, 7)


In [132]:
final_test_predictions = baseline.predict(test_features)  # Use as features do conjunto de teste
Y_test = pd.DataFrame(final_test_predictions, columns=['x_1', 'y_1', 'x_2', 'y_2', 'x_3', 'y_3'])



Y_test['Id'] = X_test['Id'].values 
Y_test = Y_test[['Id', 'x_1', 'y_1', 'x_2', 'y_2', 'x_3', 'y_3']]


Y_test.to_csv('baseline-model.csv',index=False)

In [101]:
def validate_poly_regression(train_features,train_targets,val_features,val_targets, regressor=None, degrees=range(1, 5), max_features=None):
    best_rmse = float('inf')
    best_model = None
    best_degree = 0
    
    for degree in degrees:
        # Create a pipeline with PolynomialFeatures and the regressor
        pipeline = Pipeline([
            ('poly', PolynomialFeatures(degree=degree)),
            ('scaler', StandardScaler()),
            ('regressor', regressor if regressor else LinearRegression())
        ])
        
        # Fit the pipeline on the training data
        pipeline.fit(train_features, train_targets)
        
        # Predict on the validation data
        y_pred = pipeline.predict(val_features)
        
        # Calculate RMSE
        rmse = np.sqrt(mean_squared_error(val_targets, y_pred))
        
        # Print the number of features generated by PolynomialFeatures
        n_features = pipeline.named_steps['poly'].n_output_features_
        
        
        # Update the best model if the current one is better
        if rmse < best_rmse:
            best_rmse = rmse
            best_model = pipeline
            best_degree = degree
    
    print(f'Best degree: {best_degree}, Best RMSE: {best_rmse}')
    return best_model, best_rmse

In [None]:
X_test=pd.read_csv("C:/Users/ptoma/Desktop/X_test.csv")

rename_dict = {
    'x0_1': 'x_1',
    'y0_1': 'y_1',
    'x0_2': 'x_2',
    'y0_2': 'y_2',
    'x0_3': 'x_3',
    'y0_3': 'y_3'
}

# Renomear as colunas em X_test
X_test.rename(columns=rename_dict, inplace=True)

# Verificar as novas colunas de X_test
print(X_test.columns)

In [None]:
test_features=X_test[['x_1', 'y_1', 'x_2', 'y_2', 'x_3', 'y_3','t']]

In [None]:
test_predictions = baseline.predict(test_features)  # Use as features do conjunto de teste
Y_test = pd.DataFrame(test_predictions, columns=['x_1', 'y_1', 'x_2', 'y_2', 'x_3', 'y_3'])



Y_test['Id'] = X_test['Id'].values 
Y_test = Y_test[['Id', 'x_1', 'y_1', 'x_2', 'y_2', 'x_3', 'y_3']]


Y_test.to_csv('baseline-model.csv',index=False)


baseline-model.head()