# 1. Simple example for Feature engineering-(Including Training set Test set)

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Create a sample dataset
data = {
    'Feature1': [1, 2, 3, 4, 5],
    'Feature2': [5, 4, 3, 2, 1],
    'Target': [0, 1, 0, 1, 0]
}

df = pd.DataFrame(data)

In [2]:
# Original dataset
print("Original Dataset:")
print(df)

Original Dataset:
   Feature1  Feature2  Target
0         1         5       0
1         2         4       1
2         3         3       0
3         4         2       1
4         5         1       0


In [3]:
# Feature engineering: Create a new feature by combining Feature1 and Feature2
df['NewFeature'] = df['Feature1'] + df['Feature2']

# Updated dataset with the new feature
print("\nDataset with New Feature:")
print(df)


Dataset with New Feature:
   Feature1  Feature2  Target  NewFeature
0         1         5       0           6
1         2         4       1           6
2         3         3       0           6
3         4         2       1           6
4         5         1       0           6


In [4]:
# Split the dataset into features (X) and target variable (y)
X = df[['Feature1', 'Feature2', 'NewFeature']]
y = df['Target']

In [5]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a simple model (Random Forest) on the updated dataset
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"\nModel Accuracy: {accuracy}")


Model Accuracy: 0.0


# 2. Time Date feature extraction example

In [6]:
import pandas as pd
from datetime import datetime

# Create a sample dataset with a timestamp column
data = {
    'Timestamp': ['2023-01-15 08:30:00', '2023-01-15 14:45:00', '2023-01-16 20:15:00', '2023-01-17 12:00:00']
}

df = pd.DataFrame(data)

# Convert the 'Timestamp' column to datetime format
df['Timestamp'] = pd.to_datetime(df['Timestamp'])

# Feature engineering: Extract features from the timestamp
df['DayOfWeek'] = df['Timestamp'].dt.dayofweek  # Monday is 0 and Sunday is 6
df['HourOfDay'] = df['Timestamp'].dt.hour
df['IsWeekend'] = df['Timestamp'].dt.weekday // 5  # 1 for Saturday or Sunday, 0 for other days

# Display the updated dataset with new features
print("Updated Dataset with New Features:")
print(df)

Updated Dataset with New Features:
            Timestamp  DayOfWeek  HourOfDay  IsWeekend
0 2023-01-15 08:30:00          6          8          1
1 2023-01-15 14:45:00          6         14          1
2 2023-01-16 20:15:00          0         20          0
3 2023-01-17 12:00:00          1         12          0


# 3. Simple example dataset with product information - Feature creation

In [9]:
import pandas as pd

# Create a sample dataset with product information
data = {
    'ProductID': [1, 2, 3, 4],
    'ProductName': ['Laptop', 'Smartphone', 'Headphones', 'Tablet'],
    'Price': [1200, 800, 150, 500],
    'StockQuantity': [50, 100, 30, 80],
    'Category': ['Electronics', 'Electronics', 'Audio', 'Electronics']
}

df = pd.DataFrame(data)
print(df)
# Feature engineering: Create new features
df['PricePerUnit'] = df['Price'] / df['StockQuantity']  # Price per unit of stock
df['IsExpensive'] = df['Price'] > df['Price'].mean()  # Binary feature indicating if the product is expensive
df['CategoryCount'] = df.groupby('Category')['ProductID'].transform('count')  # Count of products in the same category

# Display the updated dataset with new features
print("Updated Dataset with New Features:")
print(df)

   ProductID ProductName  Price  StockQuantity     Category
0          1      Laptop   1200             50  Electronics
1          2  Smartphone    800            100  Electronics
2          3  Headphones    150             30        Audio
3          4      Tablet    500             80  Electronics
Updated Dataset with New Features:
   ProductID ProductName  Price  StockQuantity     Category  PricePerUnit  \
0          1      Laptop   1200             50  Electronics         24.00   
1          2  Smartphone    800            100  Electronics          8.00   
2          3  Headphones    150             30        Audio          5.00   
3          4      Tablet    500             80  Electronics          6.25   

   IsExpensive  CategoryCount  
0         True              3  
1         True              3  
2        False              1  
3        False              3  


# 4 Feature extraction from image

In [6]:
import cv2
from skimage.feature import hog
from skimage import exposure
# Load an example image
image = cv2.imread('example_image.jpg', cv2.IMREAD_GRAYSCALE)
# Calculate HOG features
hog_features, hog_image = hog(image, orientations=160, pixels_per_cell=(6, 6),
 cells_per_block=(1, 1), visualize=True)
# Enhance the HOG image for better visualization
hog_image_rescaled = exposure.rescale_intensity(hog_image, in_range=(0, 10))
# Display the original image and the HOG image
cv2.imshow('Original Image', image)
cv2.imshow('HOG Image', hog_image_rescaled)
cv2.waitKey(0)
cv2.destroyAllWindows()

In [2]:
pip install opencv-python

Collecting opencv-python
  Downloading opencv_python-4.8.1.78-cp37-abi3-win_amd64.whl (38.1 MB)
     ---------------------------------------- 38.1/38.1 MB 3.0 MB/s eta 0:00:00
Installing collected packages: opencv-python
Successfully installed opencv-python-4.8.1.78
Note: you may need to restart the kernel to use updated packages.


In [4]:
pip install --upgrade pip

Collecting pip
  Downloading pip-23.3.1-py3-none-any.whl (2.1 MB)
     ---------------------------------------- 2.1/2.1 MB 2.7 MB/s eta 0:00:00
Installing collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 22.3.1
    Uninstalling pip-22.3.1:
      Successfully uninstalled pip-22.3.1
Successfully installed pip-23.3.1
Note: you may need to restart the kernel to use updated packages.


# Example to find Correlation

In [9]:
import pandas as pd

data = pd.DataFrame({'Age': [25, 30, 35, 40, 45],
                     'Income': [50000, 60000, 75000, 80000, 10000]})

# Calculate the correlation matrix
correlation_matrix = data.corr()

# Check the correlation between 'Age' and 'Income'
age_income_correlation = correlation_matrix.loc['Age', 'Income']

print(f'Correlation between Age and Income: {age_income_correlation}')


Correlation between Age and Income: -0.3407771005482389


# Date feature extraction

In [11]:
import pandas as pd
data={'date':['2023-11-22','2023-10-20','2023-09-23']}
df=pd.DataFrame(data)
print(df)

         date
0  2023-11-22
1  2023-10-20
2  2023-09-23


In [14]:
df['date']=pd.to_datetime(df['date'])
print(df)

        date
0 2023-11-22
1 2023-10-20
2 2023-09-23


In [15]:
df['day']=df['date'].dt.day
df['month']=df['date'].dt.month
df['year']=df['date'].dt.year
print(df)

        date  day  month  year
0 2023-11-22   22     11  2023
1 2023-10-20   20     10  2023
2 2023-09-23   23      9  2023


# Create new feature from dataframe

In [16]:
import pandas as pd
data={
    'ProductID':[1,2,3,4],
    'ProductName':['Lap','Phone','Headset','Tablet'],
    'Price':[1200,800,150,500],
    'StockQuantity':[50,100,30,80],
    'Category':['Elect','Elect','Audio','Elect']  
}
df=pd.DataFrame(data)
print(df)

   ProductID ProductName  Price  StockQuantity Category
0          1         Lap   1200             50    Elect
1          2       Phone    800            100    Elect
2          3     Headset    150             30    Audio
3          4      Tablet    500             80    Elect


In [17]:
df['Price per unit']=df['Price']/df['StockQuantity']
df['IsExpensive']=df['Price']>df['Price'].mean()
df['Category']=df.groupby('Category')['ProductID'].transform('count')
print("Updated values")
print(df)

Updated values
   ProductID ProductName  Price  StockQuantity  Category  Price per unit  \
0          1         Lap   1200             50         3           24.00   
1          2       Phone    800            100         3            8.00   
2          3     Headset    150             30         1            5.00   
3          4      Tablet    500             80         3            6.25   

   IsExpensive  
0         True  
1         True  
2        False  
3        False  
