In [1]:
import pandas as pd
import datetime as dt
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
df = pd.read_csv(r"FAANG.csv")
df

In [None]:
df.columns

In [None]:
df.info()

In [None]:
print(df.to_string())

In [None]:
df.isnull().sum()

**Empty columns are removed**

In [None]:
# Columns this notebook treats as empty; they are removed before any further cleaning.
empty_columns = [
    "Revenue",
    "Gross Profit",
    "Operating Income",
    "Cash Ratio",
    "Total Assets",
    "Total Equity",
    "Trailing Twelve Months (TTM) Revenue",
    "Trailing Twelve Months (TTM) EBITDA",
    "Trailing Twelve Months (TTM) Earnings",
]
df = df.drop(columns=empty_columns)
df


**Convert the `Date` column to pandas datetime**

In [None]:
# Parse the Date column into pandas datetime values; the column keeps its position.
df = df.assign(Date=pd.to_datetime(df["Date"]))
df

In [None]:
df.isnull().sum()

**Replacing the NaN values for these columns with their respective mean**

In [None]:
# Columns whose missing entries are imputed
columns_to_fill = ["Beta", "Dividends Paid", "Dividend Yield", "Beta (5Y)", "Annual Dividend Rate"]

# Compute each column's own mean once, then fill all of the gaps in a single call
column_means = {name: df[name].mean() for name in columns_to_fill}
df = df.fillna(value=column_means)
df

**Mean values filled**

In [None]:
df[["Beta", "Dividends Paid", "Dividend Yield", "Beta (5Y)", "Annual Dividend Rate"]]

**no null values**

In [None]:
df.isnull().sum()

In [None]:
df.columns

**Inspect the unique values of the three categorical columns below; all remaining columns are numerical.**

**Mode imputation (filling with the most frequent value) is not needed for these categorical columns, because none of them contains missing values.**

In [None]:
df["Company"].unique()

In [None]:
df["Ticker"].unique()

In [None]:
df["Analyst Recommendation"].unique()

**One_Hot_Encoding**

**The categorical columns are encoded so that they become numerical. This dataset has only three categorical columns: Company, Ticker and Analyst Recommendation. Company and Ticker are both nominal, because they are just names or labels without any inherent order or ranking.
Analyst Recommendation is also treated as nominal, because it does not have an inherent order or ranking between values like "buy", "sell", or "hold". If it instead held graded values such as "Strong Buy", "Buy", "Hold", "Sell", it could be considered ordinal, because there is a natural order (from stronger to weaker recommendation). In this case, nominal is the right category for Analyst Recommendation.**

In [17]:
# One-hot encode 'Company' and 'Ticker': every unique value gets its own indicator
# column (like 'Company_Apple', 'Company_Facebook', etc.).
# drop_first=False keeps all categories as separate columns, which is fine, though
# drop_first=True is sometimes recommended to avoid the dummy variable trap
# (multicollinearity).
categorical_columns = ['Company', 'Ticker']
df = pd.get_dummies(df, columns=categorical_columns, drop_first=False)

# Cast every boolean column (the indicators, when get_dummies emits bools) to integers
bool_columns = df.columns[df.dtypes == 'bool']
df[bool_columns] = df[bool_columns].astype(int)

# 'Analyst Recommendation' is reported to hold only "buy", so label encoding is not
# necessary; the column is simply overwritten with a constant value (0).
df['Analyst Recommendation'] = 0


**Re-Ordering the columns in data frame**

In [18]:
# Split the column labels into the one-hot indicators (prefixed 'Company_' or 'Ticker_')
# and everything else, preserving the existing order inside each group.
encoded_prefixes = ('Company_', 'Ticker_')
encoded_columns = [name for name in df.columns if name.startswith(encoded_prefixes)]
remaining_columns = [name for name in df.columns if name not in encoded_columns]

# Indicator columns go first, the other columns follow
df = df[encoded_columns + remaining_columns]

In [None]:
print(df.to_string())

In [None]:
# Discard every row that still contains a missing value
df = df.dropna()
df

**correlation checking**

In [None]:
# Relax pandas' display limits so the whole correlation matrix prints untruncated
display_options = {
    'display.max_rows': None,            # Show all rows
    'display.max_columns': None,         # Show all columns
    'display.width': 1000,               # Avoid line breaks
    'display.expand_frame_repr': False,  # Prevent truncation
}
for option_name, option_value in display_options.items():
    pd.set_option(option_name, option_value)

correlation_matrix = df.corr()
print(correlation_matrix)

In [None]:
# Annotated heatmap of the pairwise correlations, drawn on an explicit large axes
fig, ax = plt.subplots(figsize=(27, 20))
sns.heatmap(df.corr(), annot=True, ax=ax)
plt.show()

**These columns will not make any significant difference to the prediction, so I removed them.**

In [23]:
# Columns removed after the correlation check above
columns_to_drop = [
    "Open",
    "High",
    "Low",
    "Analyst Recommendation",
    "Adj Close",
    "Ticker_AAPL",
    "Ticker_AMZN",
    "Ticker_GOOGL",
    "Ticker_META",
    "Ticker_NFLX",
    "Beta",
]
df = df.drop(columns=columns_to_drop)

**EDA**

**Data Visualization**

In [None]:
def data_visualization(df, column):
    """Show a four-panel overview of one numeric column.

    The panels are, left to right: a line chart of the column against 'Date',
    a box plot, a histogram with a KDE curve, and a scatter plot of the column
    against 'Volume'.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``column`` plus the 'Date' and 'Volume' columns that
        the line chart and the scatter plot read.
    column : str
        Label of the column to visualize.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``. A non-numeric column is
        not plotted; a notice is printed instead.
    """
    series = df[column]

    # Accept any numeric width (e.g. int32 or float32), not just float64/int64,
    # so integer columns are not skipped on platforms where `int` is 32-bit.
    # Booleans count as numeric for pandas but were never plotted, so keep excluding them.
    is_plottable = (
        pd.api.types.is_numeric_dtype(series)
        and not pd.api.types.is_bool_dtype(series)
    )
    if not is_plottable:
        # If the column is not numeric, skip all plots
        print(f"Skipping {column} because it is not numeric.")
        return

    fig, axes = plt.subplots(1, 4, figsize=(15, 4))
    trend_ax, box_ax, hist_ax, scatter_ax = axes

    # Line Chart: Show trends in stock prices over time
    trend_ax.plot(df['Date'], series, label=f'{column} Trend', color='blue')
    trend_ax.set_title(f"Line Chart for {column}")
    trend_ax.set_xlabel('Date')
    trend_ax.set_ylabel(f'{column}')
    trend_ax.tick_params(axis='x', rotation=45)
    trend_ax.legend()

    # Box Plot: Detect outliers in price and volume
    sns.boxplot(x=series, color='orange', ax=box_ax)
    box_ax.set_title(f"Box Plot for {column}")

    # Histogram: Distribution of values for the column
    sns.histplot(series, kde=True, bins=30, color='salmon', ax=hist_ax)
    hist_ax.set_title(f"Histogram for {column}")

    # Scatter Plot: Visualize relationships between stock prices and volume
    sns.scatterplot(x=series, y=df['Volume'], color='green', ax=scatter_ax)
    scatter_ax.set_title(f"Scatter Plot for {column}")
    scatter_ax.set_xlabel(f'{column}')
    scatter_ax.set_ylabel('Volume')

    fig.tight_layout()
    plt.show()
    # The function is called once per column; release each figure after showing it
    plt.close(fig)

# Run the visualization for every column of df. data_visualization itself prints
# a notice and skips any column that is not numeric, so no pre-filtering is done here.
columns = df.columns  # Index holding all column labels
for column in columns:
    data_visualization(df, column)



**Outliers_handling**

In [None]:
# Columns that are screened for outliers
numeric_columns = ['Close', 'Volume']
# Filter a copy (df1) so the original df stays untouched and can be compared against it
df1 = df.copy()

# Function to remove outliers using IQR for a single column (with added print statements)
def remove_outliers_iqr(df, column, multiplier=1.5):
    """Return the rows of ``df`` whose ``column`` value lies inside the IQR fences.

    The fences are ``Q1 - multiplier * IQR`` and ``Q3 + multiplier * IQR``, both
    inclusive. Rows whose value is NaN fail the range check and are removed too.
    The quartiles, the bounds and the row counts are printed for debugging.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    column : str
        Label of the numeric column to screen.
    multiplier : float, default 1.5
        Width of the fences in units of the IQR.

    Returns
    -------
    pandas.DataFrame
        Independent copy of the surviving rows, with their original index labels.
    """
    values = df[column]
    Q1 = values.quantile(0.25)  # 25th percentile (Q1)
    Q3 = values.quantile(0.75)  # 75th percentile (Q3)
    IQR = Q3 - Q1  # Interquartile Range
    lower_bound = Q1 - multiplier * IQR  # Lower limit
    upper_bound = Q3 + multiplier * IQR  # Upper limit

    # Print out the values for debugging
    print(f"Column: {column}")
    print(f"Q1: {Q1}, Q3: {Q3}, IQR: {IQR}")
    print(f"Lower Bound: {lower_bound}, Upper Bound: {upper_bound}")

    # Keep rows inside the bounds (inclusive on both ends); .copy() detaches the
    # result from df so later assignments on it are unambiguous.
    filtered_df = df[values.between(lower_bound, upper_bound)].copy()

    # Print number of rows before and after filtering
    print(f"Rows before: {len(df)}, Rows after: {len(filtered_df)}")

    return filtered_df

# Filter df1 one column at a time. Each pass runs on the rows that survived the
# previous pass, so the 'Volume' bounds are computed after the 'Close' outliers
# have already been removed.
for column in numeric_columns:
    df1 = remove_outliers_iqr(df1, column)

In [None]:
# Re-plot the screened columns from df1 to inspect the effect of the outlier filtering
for column in numeric_columns:
    data_visualization(df1, column)

**We are going to calculate the Z-score for each value in the numeric columns and filter out those with Z-scores greater than 3 or less than -3.
We keep only those rows where the Z-score is between -3 and 3, which removes data points that are too far from the mean (outliers).
We already filtered the outliers using IQR and now we are filtering with Z-score.**

In [None]:
from scipy import stats

# Define the numeric columns to apply Z-score
numeric_columns = ['Close', 'Volume']

# Function to remove outliers using Z-score
def remove_outliers_zscore(df, column, threshold=3):
    """Return the rows of ``df`` whose ``column`` z-score lies strictly inside ±``threshold``.

    Z-scores are computed with ``scipy.stats.zscore`` while ignoring NaN values, so
    a missing entry only removes its own row instead of turning every z-score into
    NaN. A column without spread (constant, or no valid values) has no meaningful
    z-scores; in that case every row with a non-missing value is kept.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    column : str
        Label of the numeric column to screen.
    threshold : float, default 3
        Rows are kept when ``-threshold < z < threshold``.

    Returns
    -------
    pandas.DataFrame
        Independent copy of the surviving rows, with their original index labels.
    """
    values = df[column]

    # Print out the mean and std for debugging.
    # Note: this is pandas' sample std (ddof=1); the z-scores below use scipy's
    # default population std (ddof=0), so the two differ slightly.
    print(f"Column: {column}")
    print(f"Mean: {values.mean()}, Std: {values.std()}")

    if values.std(ddof=0) > 0:
        # nan_policy='omit' leaves NaN inputs out of the mean/std and yields NaN
        # z-scores for them; NaN fails both comparisons, so those rows drop out.
        z_scores = stats.zscore(values.to_numpy(dtype=float), nan_policy='omit')
        keep = (z_scores < threshold) & (z_scores > -threshold)
    else:
        # Zero or undefined spread: nothing can be an outlier, keep all valid rows
        keep = values.notna().to_numpy()

    # .copy() detaches the result from df so later assignments on it are unambiguous
    df_no_outliers = df[keep].copy()

    # Print number of rows before and after filtering
    print(f"Rows before: {len(df)}, Rows after: {len(df_no_outliers)}")

    return df_no_outliers

# Apply the Z-score filter to df1, one column at a time. Each pass runs on the rows
# left by the previous pass, so the mean and std for 'Volume' are computed after the
# 'Close' outliers are gone.
for column in numeric_columns:
    df1 = remove_outliers_zscore(df1, column)


In [None]:
print(df1.to_string())

**Reset the index**

In [29]:
# Renumber the rows 0..n-1 and discard the old index labels left over from filtering
df1 = df1.reset_index(drop=True)

In [None]:
print(df1.to_string())

In [31]:
# Save the cleaned DataFrame to a CSV file
df1.to_csv('cleaned_data.csv', index=False)