In [None]:
import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.svm import SVR

# Location of the raw CSV on the local machine (raw string keeps the
# Windows backslashes literal).
file_path = r"C:\Users\safwan\Downloads\archive(1)\IOT-temp.csv"

# Load the whole file into a DataFrame, decoding it as UTF-8.
df = pd.read_csv(filepath_or_buffer=file_path, encoding='utf-8')

def fix_date_format(string0):
    """Rewrite a ``day-month-year[ time]`` string as ``month-day-year[ time]``.

    The value is converted with ``str()`` and stripped first, so ``None`` and
    float NaN are accepted and treated as missing.

    Args:
        string0: The raw date value (any object; it is stringified).

    Returns:
        * ``None`` when the value is missing (empty, ``nan``, ``none`` in any
          letter case), when day or month is not an integer, or when either
          is zero.
        * The reordered string, with day and month zero-padded to two
          characters, when the value splits into exactly three ``-``
          separated parts.
        * The stripped input unchanged when it is not a three-part
          ``-`` separated value, so a later parser can still try it.
    """
    string0 = str(string0).strip()

    # str() turns None / NaN into 'None' / 'nan'; compare case-insensitively
    # so variants such as 'NaN' or 'NONE' are treated as missing too.
    if string0.lower() in ('', 'nan', 'none'):
        return None

    parts = string0.split('-')
    if len(parts) != 3:
        return string0

    # Strip each component so stray spaces do not defeat zfill() below.
    day, month, year_time = (part.strip() for part in parts)

    # Everything after the first space of the last component is the time.
    year, _, time_part = year_time.partition(' ')

    try:
        if int(day) == 0 or int(month) == 0:
            return None
    except ValueError:
        # Non-numeric day or month: the value cannot be a date.
        return None

    day = day.zfill(2)
    month = month.zfill(2)

    if time_part:
        return f"{month}-{day}-{year} {time_part}"
    return f"{month}-{day}-{year}"

def is_valid_date(string0):
    """Return True if *string0* is a real ``day-month-year[ time]`` date.

    The value is converted with ``str()`` and stripped first. Only the date
    part is checked; anything after the first space of the year component
    (normally a time of day) is ignored.

    Args:
        string0: The raw date value (any object; it is stringified).

    Returns:
        ``True`` when day, month and year are integers that form an existing
        calendar date (month lengths and leap years are honoured, and the
        year must be one ``datetime.date`` accepts); ``False`` for missing
        values, values with fewer than three ``-`` separated parts,
        non-numeric components, or impossible dates such as ``31-02-2018``.
    """
    string0 = str(string0).strip()

    if string0.lower() in ('', 'nan', 'none'):
        return False

    # maxsplit=2 keeps any further '-' inside the last component.
    parts = string0.split('-', 2)
    if len(parts) < 3:
        return False
    day, month, rest = parts

    year_part = rest.strip().split(' ', 1)[0]

    try:
        # datetime.date rejects zero or out-of-range day/month/year and
        # days that do not exist in the given month.
        datetime.date(int(year_part), int(month), int(day))
    except (ValueError, OverflowError):
        # ValueError: non-numeric or out-of-range component.
        # OverflowError: integer too large for the C-level constructor.
        return False
    return True


# Remember the size of the freshly loaded frame for the summary at the end,
# instead of reading the CSV from disk a second time just to count rows.
original_rows = len(df)
print(f"Original dataset shape: {df.shape}")

# Rows without any timestamp cannot be used.
df = df.dropna(subset=['noted_date'])
print(f"After dropping NaN in noted_date: {df.shape}")

# Validate BEFORE reformatting: is_valid_date() reads the string as
# day-month-year, while fix_date_format() rewrites it to month-day-year.
# Validating the rewritten string would swap day and month and reject every
# row whose original day is greater than 12.
valid_date_mask = df['noted_date'].apply(is_valid_date).astype(bool)
# .copy() makes the filtered frame independent, so the column assignments
# below do not operate on a slice of the previous frame.
df = df[valid_date_mask].copy()
print(f"After removing invalid dates: {df.shape}")

# Reorder to month-day-year; fix_date_format() returns None for values it
# rejects, and those rows are dropped.
df['noted_date'] = df['noted_date'].apply(fix_date_format)
df = df.dropna(subset=['noted_date'])
print(f"After fixing date formats: {df.shape}")

# errors='coerce' turns anything still unparseable into NaT, which is then
# dropped like any other missing value.
df['noted_date'] = pd.to_datetime(df['noted_date'], errors='coerce')
df = df.dropna(subset=['noted_date'])
print(f"After final datetime conversion: {df.shape}")

# Expand the timestamp into separate numeric calendar/time columns.
noted = df['noted_date'].dt
df['year'] = noted.year
df['month'] = noted.month
df['day'] = noted.day
df['day_of_week'] = noted.dayofweek
df['hour'] = noted.hour
df['minute'] = noted.minute

print(df.head(10))
print(f"Final dataset shape: {df.shape}")

# Report how many rows the cleaning steps removed in total.
final_rows = len(df)
print(f"Removed {original_rows - final_rows} rows out of {original_rows} total rows")


# Target is the 'temp' column; the predictors are every remaining column
# except 'room_id/id', 'id' and the raw 'noted_date' timestamp.
Y = df['temp']
X = df.drop(columns=['temp', 'room_id/id', 'id', 'noted_date'])

print(f"Features shape: {X.shape}")
print(f"Target shape: {Y.shape}")
print(f"Feature columns: {list(X.columns)}")
print(f"Feature dtypes:\n{X.dtypes}")

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

print(f"Training data shape: {x_train.shape}")
print(f"Test data shape: {x_test.shape}")


from sklearn.metrics import mean_squared_error, r2_score

# Which columns get which preprocessing.
categorical_columns = ['out/in']
numerical_columns = ['year', 'month', 'day', 'day_of_week', 'hour', 'minute']

# Standardise the numeric columns and one-hot encode the 'out/in' column
# (dropping the first category of the encoded column).
numeric_step = ('num', StandardScaler(), numerical_columns)
categorical_step = (
    'cat',
    OneHotEncoder(drop='first', handle_unknown='ignore'),
    categorical_columns,
)
preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])

# Preprocessing and a 100-tree random forest chained into one estimator, so
# the transformers are fitted on the training rows only.
rf_model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(n_estimators=100, random_state=42)),
])

rf_model.fit(x_train, y_train)
y_pred_rf = rf_model.predict(x_test)

# Evaluate on the held-out rows.
mse = mean_squared_error(y_test, y_pred_rf)
r2 = r2_score(y_test, y_pred_rf)

print(f"Mean Squared Error: {mse:.2f}")
print(f"R² Score: {r2:.2f}")

# Names of the transformed columns: numeric columns first, then the one-hot
# columns produced by the fitted encoder (same order as the transformers).
fitted_encoder = rf_model.named_steps['preprocessor'].named_transformers_['cat']
feature_names = [
    *numerical_columns,
    *fitted_encoder.get_feature_names_out(categorical_columns),
]

print(f"Feature names: {feature_names}")

Original dataset shape: (97606, 5)
After dropping NaN in noted_date: (97606, 5)
After fixing date formats: (97606, 5)
After removing invalid dates: (49944, 5)
After final datetime conversion: (49944, 5)
                                    id  room_id/id          noted_date  temp  \
0  __export__.temp_log_196134_bd201015  Room Admin 2018-12-08 09:30:00    29   
1  __export__.temp_log_196131_7bca51bc  Room Admin 2018-12-08 09:30:00    29   
2  __export__.temp_log_196127_522915e3  Room Admin 2018-12-08 09:29:00    41   
3  __export__.temp_log_196128_be0919cf  Room Admin 2018-12-08 09:29:00    41   
4  __export__.temp_log_196126_d30b72fb  Room Admin 2018-12-08 09:29:00    31   
5  __export__.temp_log_196125_b0fa0b41  Room Admin 2018-12-08 09:29:00    31   
6  __export__.temp_log_196121_01544d45  Room Admin 2018-12-08 09:28:00    29   
7  __export__.temp_log_196122_f8b80a9f  Room Admin 2018-12-08 09:28:00    29   
8  __export__.temp_log_196111_6b7a0848  Room Admin 2018-12-08 09:26:00    29 