# Noise Cleaning for Crude Oil Price Data

Import the required libraries and load the dataset.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Read the crude-oil CSV into a DataFrame and preview its first rows.
# NOTE(review): the path is absolute and machine-specific; adjust it when
# running this notebook elsewhere.
csv_path = r'C:\Users\HP\My Projects\Crude Oil.csv'
df = pd.read_csv(csv_path)
df.head()

### Exploratory Data Analysis (EDA)

In [None]:
# Print a concise structural summary of the DataFrame.
df.info()

# Descriptive statistics for the dataset; kept as the cell's final expression
# so the notebook renders the resulting table.
summary_stats = df.describe()
summary_stats

In [None]:
# Report how many values are missing in each column. The counts are printed
# explicitly because a notebook only renders a cell's final expression, so a
# bare `df.isnull().sum()` in the middle of the cell would show nothing.
print("\nMissing Values:")
print(df.isnull().sum())

# Fill missing numeric values with the column mean. numeric_only=True keeps
# non-numeric columns out of the mean calculation, which can otherwise raise
# a TypeError on pandas >= 2.0. Gaps in non-numeric columns are left as-is.
df.fillna(df.mean(numeric_only=True), inplace=True)

In [None]:
# Show every row that repeats an earlier row. The frame is printed explicitly
# because a notebook only renders a cell's final expression, so the original
# bare `df[df.duplicated()]` produced no output after the header.
print("\nDuplicate Rows:")
print(df[df.duplicated()])

# Drop the repeated rows, keeping the first occurrence of each.
df = df.drop_duplicates()

### Visualize Outliers

In [None]:
from itertools import combinations

# Step 1: Identify the numerical columns in the DataFrame
numerical_columns = df.select_dtypes(include=[np.number]).columns

# Step 2: Visualize outliers using box plots (one box per numerical column)
plt.figure(figsize=(10, 6))
df[numerical_columns].boxplot()
plt.title('Box Plot of Numerical Features')
plt.xticks(rotation=45)
plt.show()

# Step 3: Visualize outliers using scatter plots, one per unordered pair of
# numerical features. A new figure is created inside the loop so that every
# plot gets the intended size; creating it once before the loop only sized
# the first plot, because plt.show() finishes the current figure and later
# plotting calls start a default-sized one.
for x_col, y_col in combinations(numerical_columns, 2):
    plt.figure(figsize=(12, 8))
    plt.scatter(df[x_col], df[y_col])
    plt.xlabel(x_col)
    plt.ylabel(y_col)
    plt.title(f'Scatter Plot of {x_col} vs {y_col}')
    plt.show()

### Remove Outliers

In [None]:
# Remove outliers using the IQR method.
# Quartiles and comparisons are computed on the numeric columns only:
# calling quantile() or ordering comparisons on a frame that also holds
# non-numeric columns can raise a TypeError.
numeric_df = df.select_dtypes(include=[np.number])
Q1 = numeric_df.quantile(0.25)
Q3 = numeric_df.quantile(0.75)
IQR = Q3 - Q1

# A value is an outlier when it lies more than 1.5 * IQR outside the
# interquartile range of its column.
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
is_outlier_row = ((numeric_df < lower_bound) | (numeric_df > upper_bound)).any(axis=1)

# Keep only rows with no outlier in any numeric column; all original columns
# (numeric or not) are retained for the surviving rows.
df = df[~is_outlier_row]

### Data Standardization or Normalization (Optional)

In [None]:
# Example: rescale a chosen set of numerical features.
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# NOTE(review): these column names look like template placeholders; replace
# them with real column names from the dataset before running.
feature_cols = ['numerical_feature1', 'numerical_feature2']

scaler = StandardScaler()  # or MinMaxScaler()

# Fit the scaler on the selected columns and write the scaled values back
# into the same columns.
scaled_values = scaler.fit_transform(df[feature_cols])
df[feature_cols] = scaled_values

### Feature Selection and Engineering (optional)
Select the features relevant to the analysis and model training.

Engineer new features if necessary.

### Handling Categorical Variables (optional)

In [None]:
# One-hot encode categorical columns so they become numerical indicator
# columns (label encoding is an alternative technique).
# NOTE(review): 'categorical_column' looks like a template placeholder;
# replace it with real column names from the dataset.
categorical_cols = ['categorical_column']
df = pd.get_dummies(df, columns=categorical_cols)

### Data Smoothing (optional)

In [None]:
# Reduce noisy fluctuations with a moving average.
# Example: rolling mean over a window of 3 consecutive rows.
window_size = 3
rolling_view = df['column'].rolling(window=window_size)
df['smoothed_column'] = rolling_view.mean()

### Feature Scaling (optional)

In [None]:
# Bring a numerical feature into a common range (if needed).
# Example: Min-Max scaling, i.e. (value - min) / (max - min).
source_column = df['column']
column_min = source_column.min()
column_max = source_column.max()
df['scaled_column'] = (source_column - column_min) / (column_max - column_min)

### Feature Transformation (optional)

In [None]:
# Feature transformations (log, square root, polynomial, etc.) can be
# applied here.
# Example: natural-log transformation of one column.
source_values = df['column']
df['transformed_column'] = np.log(source_values)