In [32]:
import pandas as pd

# Read the cars spreadsheet into the working DataFrame
df = pd.read_excel(io="carsdatabase.xlsx")

# Preview the top five rows to check the column layout
print(df.head(n=5))

# 1. Drop columns assumed to be irrelevant for predicting car price
# The candidate names are filtered against the frame first, so a name that
# is absent from this dataset is skipped rather than raising in drop().
# NOTE(review): neither candidate shows up among the columns visible in the
# (truncated) preview printed after loading -- confirm this step is not a
# no-op for this workbook.
columns_to_remove = ['Car_ID', 'Color']
columns_present = [name for name in columns_to_remove if name in df.columns]
df = df.drop(columns=columns_present)

# 2. Remove records with missing values
# This runs BEFORE the one-hot encoding below on purpose: get_dummies() does
# not propagate a missing category as NaN -- with the default dummy_na=False
# the row simply gets no indicator set -- so a dropna() placed after the
# encoding can no longer see that 'Fuel_Type' was missing and would silently
# keep that record.
df = df.dropna()

# 3. Convert the categorical 'Fuel_Type' strings to indicator (dummy) columns
# One 'Fuel_<value>' column is produced per distinct fuel type; the step is
# skipped when the dataset has no 'Fuel_Type' column.
if 'Fuel_Type' in df.columns:
    fuel_type_dummies = pd.get_dummies(df['Fuel_Type'], prefix='Fuel')
    # Swap the original string column for its indicator columns (appended last)
    df = pd.concat([df.drop(columns=['Fuel_Type']), fuel_type_dummies], axis=1)

# 4. Derive new columns from existing ones
# 'Age' is the gap between a fixed reference year and the 'Year' column;
# once 'Age' has been computed the 'Year' column is dropped.
# The step is skipped when the dataset has no 'Year' column.
current_year = 2024
if 'Year' in df.columns:
    df = df.assign(Age=current_year - df['Year']).drop(columns=['Year'])

# Preview the top five rows of the frame after the feature-engineering steps
print(df.head(n=5))

# Now you can proceed with further data preprocessing and training your machine learning algorithm


   Unnamed: 0 Manufacturer    Model     Type  Min.Price  Price  Max.Price  \
0           1        Acura  Integra    Small       12.9   15.9       18.8   
1           2        Acura   Legend  Midsize       29.2   33.9       38.7   
2           3         Audi       90  Compact       25.9   29.1       32.3   
3           4         Audi      100  Midsize       30.8   37.7       44.6   
4           5          BMW     535i  Midsize       23.7   30.0       36.2   

   MPG.city  MPG.highway             AirBags  ... Passengers Length  \
0        25           31                None  ...          5    177   
1        18           25  Driver & Passenger  ...          5    195   
2        20           26         Driver only  ...          5    180   
3        19           26  Driver & Passenger  ...          6    193   
4        22           30         Driver only  ...          4    186   

   Wheelbase  Width  Turn.circle  Rear.seat.room Luggage.room  Weight  \
0        102     68           37     

In [33]:
# Collect the columns whose dtype is not the same in the train and test splits
# NOTE(review): X_train / X_test are not created in any cell visible here --
# confirm they are defined earlier when the notebook is run top to bottom.
mixed_type_cols = [
    name for name in X_train.columns
    if X_train[name].dtype != X_test[name].dtype
]

# Cast each of those columns to str in both splits so their dtypes agree
# (the columns are assumed to be categorical)
for split in (X_train, X_test):
    for col in mixed_type_cols:
        split[col] = split[col].astype(str)

# Now, proceed with the encoding process


In [35]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.datasets import load_iris

# Load the dataset (replace this with your own dataset)
data = load_iris()
X, y = data.data, data.target

# Define the Random Forest Classifier.
# random_state is pinned because the forest draws its bootstrap samples and
# feature subsets at random; without a seed the cross-validation scores, and
# therefore the selected "best" parameters, can change from run to run.
rf_classifier = RandomForestClassifier(random_state=42)

# Hyperparameter grid to search: 3 x 3 x 3 = 27 candidate combinations
param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10]
}

# Exhaustive search over the grid; every candidate is scored by its accuracy
# under 5-fold cross-validation
grid_search = GridSearchCV(estimator=rf_classifier, param_grid=param_grid, cv=5, scoring='accuracy')
grid_search.fit(X, y)

# Best hyperparameters
best_params = grid_search.best_params_
print("Best Parameters:", best_params)

# Mean cross-validated accuracy of the best candidate. This is computed on
# the same data that was used to pick the parameters, so it is not a
# held-out test score.
best_accuracy = grid_search.best_score_
print("Best Accuracy:", best_accuracy)


Best Parameters: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 100}
Best Accuracy: 0.9666666666666668


In [37]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

In [38]:
# Preview the top five rows of X_train
print(X_train.head(n=5))

    Unnamed: 0 Manufacturer     Type  Min.Price  Price  Max.Price  MPG.city  \
65          66       Nissan      Van       16.7   19.1       21.5        17   
15          16    Chevrolet      Van       14.7   16.3       18.0        18   
68          69   Oldsmobile  Midsize       14.2   16.3       18.4        23   
78          79       Saturn    Small        9.2   11.1       12.9        28   
30          31         Ford    Small        6.9    7.4        7.9        31   

    MPG.highway      AirBags DriveTrain  ... Passengers  Length  Wheelbase  \
65           23         None      Front  ...          7     190        112   
15           23         None      Front  ...          7     178        110   
68           31  Driver only      Front  ...          5     190        105   
78           38  Driver only      Front  ...          5     176        102   
30           33         None      Front  ...          4     141         90   

    Width  Turn.circle Rear.seat.room  Luggage.room  Wei

In [40]:
# Preview the top five entries of y_train
print(y_train.head(n=5))

65    20.0
15    20.0
68    16.5
78    12.8
30    10.0
Name: Fuel.tank.capacity, dtype: float64


In [39]:
# Show the text form of the logistic_regression object
print(str(logistic_regression))

LogisticRegression()
