# Academic Style for Part 1

In [1]:
# Importing necessary libraries for data cleaning and exploration
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

In [2]:
# Read the project CSV (path is relative to the working directory) into `df`,
# the frame the following cells inspect and clean.
df = pd.read_csv(filepath_or_buffer='Data/QF632_Project_1.csv')

In [3]:
# Examine the structure and summary.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() emits an extra "None" line.
df.info()
# Descriptive statistics (count/mean/std/min/quartiles/max) of numeric columns.
print(df.describe())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1038 entries, 0 to 1037
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Date       1038 non-null   object 
 1   Signal     1038 non-null   float64
 2   Open       1038 non-null   float64
 3   High       1038 non-null   float64
 4   Low        1038 non-null   float64
 5   Close      1038 non-null   float64
 6   Adj Close  1038 non-null   float64
dtypes: float64(6), object(1)
memory usage: 56.9+ KB
None
            Signal         Open         High          Low        Close  \
count  1038.000000  1038.000000  1038.000000  1038.000000  1038.000000   
mean     16.766190   141.847360   142.691801   140.907746   141.840973   
std       3.095783    18.475574    18.470255    18.404504    18.497010   
min       0.000000    94.080002    95.400002    93.639999    94.790001   
25%      14.691150   132.132496   132.912495   130.542503   131.824993   
50%      17.298240   146.769997 

In [4]:
# Tally the null entries in every column and print the per-column totals.
missing_values = df.isna().sum(axis=0)
print("Missing values per column:\n", missing_values)

Missing values per column:
 Date         0
Signal       0
Open         0
High         0
Low          0
Close        0
Adj Close    0
dtype: int64


In [5]:
# Count rows that exactly repeat an earlier row (the first occurrence of each
# row is not counted as a duplicate).
duplicates = df.duplicated(keep='first').sum()
print("Number of duplicate rows:", duplicates)

Number of duplicate rows: 0


In [6]:
# Ensure dates are in the right format and sequence.
# Parse the column once with a single vectorised call; entries that cannot be
# parsed become NaT. The invalid count is taken from the very values that are
# stored back into the frame, so the report and the data cannot disagree.
parsed_dates = pd.to_datetime(df['Date'], errors='coerce')
date_check = parsed_dates.isna().sum()
print(f"Invalid dates: {date_check}")

# Check for date sequence issues
df['Date'] = parsed_dates
# Test the ordering positionally. Comparing the column with
# df['Date'].sort_values() does not work: Series comparison is label-based,
# not positional, so it cannot report a misordered column (pandas raises for
# Series that are not identically labelled).
date_order_issues = not df['Date'].is_monotonic_increasing
if date_order_issues:
    print("Dates are out of order")
else:
    print("Dates are in correct order")


Invalid dates: 0
Dates are in correct order


In [7]:
# Count values that are zero or negative in the price columns and in Signal.
# NOTE(review): 'Open' is not in the list below although the frame has an Open
# column - confirm whether it should be checked as well.
price_columns_to_check = ['High', 'Low', 'Close', 'Adj Close']

# Column name -> number of entries <= 0 in that column.
price_check = dict(
    (name, df[name].le(0).sum()) for name in price_columns_to_check
)

# Number of Signal entries <= 0.
signal_check = df['Signal'].le(0).sum()

print(f"Negative/zero historical prices: {price_check}")
print(f"Negative/zero signal values: {signal_check}")

Negative/zero historical prices: {'High': 0, 'Low': 0, 'Close': 0, 'Adj Close': 1}
Negative/zero signal values: 6


In [8]:
# Correcting negative/zero historical prices and signal values
columns_to_check = price_columns_to_check + ['Signal']

# Remove rows with a zero in any of the specified columns.
# One row mask replaces re-filtering the frame once per column, and .copy()
# yields an independent frame so the assignment below does not write into a
# slice of the previous one (the SettingWithCopyWarning pattern).
rows_without_zero = (df[columns_to_check] != 0).all(axis=1)
df = df.loc[rows_without_zero].copy()

# Turn negative prices/values to positive
df[columns_to_check] = df[columns_to_check].abs()

In [9]:
# Re-run the zero/negative counts on the current frame to confirm the cleaning.
# Column name -> number of entries <= 0 in that column.
price_check = {
    price_col: df[price_col].le(0).sum()
    for price_col in price_columns_to_check
}

# Number of Signal entries <= 0.
signal_check = df['Signal'].le(0).sum()

print(f"Negative/zero historical prices: {price_check}")
print(f"Negative/zero signal values: {signal_check}")

Negative/zero historical prices: {'High': 0, 'Low': 0, 'Close': 0, 'Adj Close': 0}
Negative/zero signal values: 0


In [10]:
# Examine updated data structure and summary
print("Updated Data Summary:")
# DataFrame.info() prints its own report and returns None; calling it directly
# avoids the extra "None" line that print(df.info()) produces.
df.info()
# Descriptive statistics of the numeric columns in the current frame.
print(df.describe())

Updated Data Summary:
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1032 entries, 0 to 1031
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   Date       1032 non-null   datetime64[ns]
 1   Signal     1032 non-null   float64       
 2   Open       1032 non-null   float64       
 3   High       1032 non-null   float64       
 4   Low        1032 non-null   float64       
 5   Close      1032 non-null   float64       
 6   Adj Close  1032 non-null   float64       
dtypes: datetime64[ns](1), float64(6)
memory usage: 64.5 KB
None
            Signal         Open         High          Low        Close  \
count  1032.000000  1032.000000  1032.000000  1032.000000  1032.000000   
mean     16.863667   141.710320   142.554951   140.771376   141.703198   
std       2.827404    18.441019    18.436117    18.370474    18.461935   
min      10.582996    94.080002    95.400002    93.639999    94.790001   
25%      14.80