# Major US Oil and Gas Stock Price EDA


This notebook performs exploratory data analysis (EDA) on the stock prices of major US oil and gas companies.
Below is a breakdown of the process and key steps, along with some improvements and additional analyses you might consider.

# Importing Necessary Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.express as px
import seaborn as sns
import os

# Matplotlib 3.6 renamed its bundled seaborn style sheets to "seaborn-v0_8-*"
# and 3.8 removed the old names, so prefer the new name and fall back to the
# legacy one only on older installations.
_plot_style = "seaborn-v0_8-dark-palette"
if _plot_style not in plt.style.available:
    _plot_style = "seaborn-dark-palette"
plt.style.use(_plot_style)


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.0.1 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "c:\Users\User\anaconda3\ttt adac\lib\runpy.py", line 197, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "c:\Users\User\anaconda3\ttt adac\lib\runpy.py", line 87, in _run_code
    exec(code, run_globals)
  File "c:\Users\User\anaconda3\ttt adac\lib\site-packages\ipykernel_launcher.py", line 17, in <module>
    app.launch_new_instance()
  File "c:\Users\User\anaconda3\ttt adac\lib\site-packages\traitlets\config\application.py", line 846, in launch_instance
    app.start()
  File "c:\Use

AttributeError: _ARRAY_API not found


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.0.1 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "c:\Users\User\anaconda3\ttt adac\lib\runpy.py", line 197, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "c:\Users\User\anaconda3\ttt adac\lib\runpy.py", line 87, in _run_code
    exec(code, run_globals)
  File "c:\Users\User\anaconda3\ttt adac\lib\site-packages\ipykernel_launcher.py", line 17, in <module>
    app.launch_new_instance()
  File "c:\Users\User\anaconda3\ttt adac\lib\site-packages\traitlets\config\application.py", line 846, in launch_instance
    app.start()
  File "c:\Use

AttributeError: _ARRAY_API not found

ImportError: cannot import name 'cbook' from partially initialized module 'matplotlib' (most likely due to a circular import) (c:\Users\User\anaconda3\ttt adac\lib\site-packages\matplotlib\__init__.py)

# Loading the Dataset
You load the dataset into a pandas DataFrame and examine the structure of the data:

In [None]:
# Read the raw price table into the notebook-wide dataframe `df`, then echo it
# as the cell output.
# NOTE(review): the path is absolute and machine-specific.
raw_csv_path = r"C:\Users\User\Documents\IRON HACK DA 2024\IH-Labs W8\Final-Project\Data\Raw data\oil and gas stock prices.csv"
df = pd.read_csv(raw_csv_path)
df

In [None]:
# Preview the first rows of the dataframe (head() returns 5 rows by default)
df.head()

In [None]:
# Preview the last rows of the dataframe (tail() returns 5 rows by default)
df.tail()


In [None]:
# Display the shape of the dataframe as a (rows, columns) tuple
df.shape

In [None]:
# List the column labels of the dataframe
df.columns

In [None]:
# Count the distinct values in each column
df.nunique()

In [None]:
# Count how often each distinct full row occurs
df.value_counts()

In [None]:
# Count the missing values in each column
df.isna().sum()

In [None]:
# Tally duplicated vs. non-duplicated rows (True marks a row that repeats an earlier one)
df.duplicated().value_counts()

In [None]:
# Store the Volume column as floats (overwrites the column in place on df)
df['Volume']=df['Volume'].astype(float)

In [None]:
# Show the dtype of every column (Volume was just cast to float)
df.dtypes

In [None]:
# Data information: print pandas' summary of the index, columns, dtypes,
# non-null counts and memory usage
df.info()

In [None]:
# List the distinct values of the Date column
df['Date'].unique()

In [None]:
# List the distinct ticker symbols present in the data
df['Symbol'].unique()

In [None]:
# List the distinct values of the Open column
df['Open'].unique()

In [None]:
# List the distinct values of the High column
df['High'].unique()

In [None]:
# List the distinct values of the Low column
df['Low'].unique()

In [None]:
# List the distinct values of the Volume column
df['Volume'].unique()

In [None]:
# List the distinct values of the Currency column
df['Currency'].unique()

In [None]:
# Descriptive statistics of the dataframe, rendered as a table whose cells
# are shaded with the "cividis" colormap.
summary_stats = df.describe()
summary_stats.style.background_gradient(cmap="cividis")


# Overview of the Dataset
1. The dataset contains stock price data for several major US oil and gas companies.
2. The companies include Exxon Mobil (XOM), Chevron (CVX), ConocoPhillips (COP), and others.

# Visualizing Stock Trading Data

The script uses Plotly to visualize the distribution of trading attributes (e.g., Open, High, Volume) for each company over time:

In [None]:
# One histogram per trading attribute: dates on the x-axis, the attribute's
# values summed per date bin (Plotly's default aggregation when y is given),
# one color per ticker symbol.
dark_layout = dict(template="plotly_dark", font=dict(family="PT Sans", size=18))
for attribute in ("Open", "High", "Volume"):
    chart_title = f"Total Trading {attribute} Distribution of Major US Oil Companies"
    fig = px.histogram(
        df,
        x="Date",
        y=attribute,
        color="Symbol",
        color_discrete_sequence=px.colors.qualitative.Set2,
        title=chart_title,
    )
    fig.update_layout(**dark_layout)
    fig.show()

# Visualizing All Attributes Together
Plot every attribute in the DataFrame as its own subplot, which is a quick way to inspect trends.

In [None]:

# One subplot per plottable column. Passing the title to DataFrame.plot puts
# it above the whole figure; a separate plt.title() call after the plot would
# only label the last subplot.
df.plot(
    subplots=True,
    figsize=(12, 12),
    linewidth=1.5,
    title="US Oil and Gas Stock Attributes",
)
plt.show()

# Volume Trend Visualization
A bar plot is used to visualize the trend of trading volume over time

In [None]:

# Bar chart of traded volume against date, drawn through the Axes object.
fig, ax = plt.subplots(figsize=(12, 6))
ax.bar(df["Date"], df["Volume"])
ax.set_title('Volume Trend', fontsize=20)
ax.set_xlabel("Date", fontsize=5)
ax.set_ylabel("Volume", fontsize=5)
# Limit how many date ticks are drawn on the x-axis.
ax.xaxis.set_major_locator(plt.MaxNLocator(15))
ax.grid()
plt.show()

In [None]:
!pip install autoviz


In [None]:
!pip show autoviz


In [None]:
# Import AutoViz and create the AutoViz_Class instance (AV) that the next
# cell uses to generate automatic charts.
from autoviz.AutoViz_Class import AutoViz_Class
AV = AutoViz_Class()




In [None]:
# Raw string: the Windows path contains "\R" and "\o", which are invalid
# escape sequences in a normal string literal (a SyntaxWarning on Python
# 3.12+). The resulting path text is unchanged.
# NOTE(review): this path is relative to the working directory, while the
# pd.read_csv cell uses an absolute path — confirm both point at the same file.
df_av = AV.AutoViz(r'Data\Raw data\oil and gas stock prices.csv')

# Moving Averages
Calculate the 20-day and 50-day moving averages of the opening prices and plot them alongside the closing price.

In [None]:
# Compute the moving averages per company: df stacks the rows of every
# symbol, so a plain rolling window would blend the prices of two different
# companies wherever one symbol's rows end and the next one's begin.
# The first 19 (resp. 49) rows of each symbol are NaN until the window fills.
# NOTE(review): assumes each symbol's rows are already in date order — confirm.
open_by_symbol = df.groupby("Symbol")["Open"]
df["MA for 20 days"] = open_by_symbol.transform(lambda prices: prices.rolling(20).mean())
df["MA for 50 days"] = open_by_symbol.transform(lambda prices: prices.rolling(50).mean())

# DataFrame.truncate slices on the index, and df still has the default
# integer index from read_csv, so date strings cannot be used as bounds.
# Select the date window from the Date column instead.
row_dates = pd.to_datetime(df["Date"])
in_window = row_dates.between(pd.Timestamp("2010-01-01"), pd.Timestamp("2022-06-10"))
df.loc[in_window, ["Close", "MA for 20 days", "MA for 50 days"]].plot(
    subplots=False, figsize=(12, 6), linewidth=2
)
plt.grid()
plt.show()

# Distribution of Opening Prices
Use Seaborn to plot the distribution of opening prices, followed by distribution plots and box plots for every non-text column.

In [None]:

# sns.distplot is deprecated; histplot with a density-normalised histogram
# and a KDE overlay is the documented replacement that draws the same picture.
sns.histplot(df["Open"], kde=True, stat="density", kde_kws=dict(cut=3), color="#FFD500")
plt.title("Distribution of open prices of US Oil and Gas stocks", fontweight="bold", fontsize=20)
plt.xlabel("Open Price", fontsize=10)


In [None]:
# Distribution plot for every non-object column, laid out on a 6x7 grid.
non_text_columns = df.select_dtypes(exclude="object").columns

# plt.figure rather than plt.subplots: the latter also creates one
# full-figure Axes that would sit underneath the grid built below.
plt.figure(figsize=(30, 25))
for position, column in enumerate(non_text_columns, start=1):
    plt.subplot(6, 7, position)
    # histplot + KDE replaces the deprecated sns.distplot.
    sns.histplot(df[column], kde=True, stat="density", kde_kws=dict(cut=3))

plt.show()

In [None]:
# Box plot for every non-object column, laid out on a 6x7 grid.
non_text_columns = df.select_dtypes(exclude="object").columns

# plt.figure rather than plt.subplots: the latter also creates one
# full-figure Axes that would sit underneath the grid built below.
plt.figure(figsize=(30, 25))
for position, column in enumerate(non_text_columns, start=1):
    plt.subplot(6, 7, position)
    # Pass the values as x= explicitly: the meaning of the first positional
    # argument of sns.boxplot differs between seaborn releases.
    sns.boxplot(x=df[column])

plt.show()

# Summary Statistics
Print the maximum and minimum opening prices.

In [None]:
# Report the highest and lowest opening price found anywhere in the data.
open_prices = df["Open"]
highest_open = open_prices.max()
lowest_open = open_prices.min()
print("Maximum open price of stock ever obtained:", highest_open)
print("Minimum open price of stock ever obtained:", lowest_open)

# Missing Data Handling
First, check for any missing data and decide how to handle it:


If the amount of missing data is small, you might opt to drop those rows.

For larger gaps, consider forward or backward filling, or more sophisticated methods like interpolation.

In [None]:
# Tally the missing entries of every column and print the per-column counts.
missing_mask = df.isna()
missing_data = missing_mask.sum()
print(missing_data)

In [None]:
# Heatmap of the missing-value mask (rows x columns), drawn without a colorbar.
null_mask = df.isnull()
sns.heatmap(null_mask, cmap="viridis", cbar=False)

In [None]:
# Keep only the rows that have no missing value in any column; df itself is
# left untouched.
df_cleaned = df.dropna(axis=0, how="any")

# Correlation Analysis

Explore how the stock attributes correlate with each other. This can help identify patterns or redundancy in the data.

Interpretation:

Look for high correlations that might indicate multicollinearity.   

Investigate any unexpected correlations that could lead to new insights.

In [None]:

# Calculate the correlation matrix of the numeric columns only.
# df also holds text columns (e.g. Symbol), and from pandas 2.0 on
# DataFrame.corr() raises on non-numeric data instead of silently dropping
# it, so restrict the frame explicitly.
corr_matrix = df.select_dtypes(include="number").corr()

In [None]:
# Annotated heatmap of the correlation matrix on a 10x8 inch figure.
plt.figure(figsize=(10, 8))
heatmap_ax = sns.heatmap(corr_matrix, cmap="coolwarm", annot=True, linewidths=0.5)
heatmap_ax.set_title("Correlation Matrix of Stock Attributes", fontsize=18)
plt.show()

# Time Series Analysis
Decompose the time series data to analyze trends, seasonality, and residuals.

In [None]:
# Decompose the Close column into trend, seasonal and residual components
# using a multiplicative model.
# NOTE(review): df appears to stack the rows of several symbols (see the
# Symbol column), so this decomposes one concatenated series rather than a
# single company's prices — confirm that is intended.
# NOTE(review): period=365 makes one cycle 365 rows long; if each row is a
# trading day, a year is closer to ~252 rows — TODO confirm.
from statsmodels.tsa.seasonal import seasonal_decompose
decomposed = seasonal_decompose(df['Close'], model='multiplicative', period=365)


In [None]:
# Draw the decomposition panels and put a title above the whole figure.
decomposition_fig = decomposed.plot()
decomposition_fig.suptitle('Time Series Decomposition of Close Price', fontsize=18)
plt.show()

# Comparison of Companies

Use visualizations to compare stock price trends, volatilities, and other key metrics across different companies.

In [None]:
# Line chart of the closing price over time, one line per ticker symbol.
closing_trend_args = dict(
    x="Date",
    y="Close",
    color="Symbol",
    title="Closing Price Trends of Major US Oil Companies",
)
fig = px.line(df, **closing_trend_args)
fig.update_layout(font=dict(size=18, family="PT Sans"), template="plotly_dark")
fig.show()

In [None]:
# One box per ticker symbol showing the spread of its closing prices.
plt.figure(figsize=(12, 6))
volatility_ax = sns.boxplot(data=df, x="Symbol", y="Close")
volatility_ax.set_title("Volatility Comparison of Closing Prices", fontsize=18)
plt.show()

Insights:

Identify which companies have the most stable or volatile stock prices.

Compare the overall performance of different companies.

# Outlier Detection
Identify and visualize any outliers in the data

In [None]:
# Side-by-side boxes for the price and volume columns; points drawn beyond
# the whiskers are the candidate outliers.
attribute_frame = df[['Open', 'High', 'Low', 'Close', 'Volume']]
plt.figure(figsize=(12, 6))
outlier_ax = sns.boxplot(data=attribute_frame)
outlier_ax.set_title('Outlier Detection in Stock Attributes', fontsize=18)
plt.show()

In [None]:
# Flag values whose absolute z-score (computed per column) exceeds 3.
from scipy import stats

attribute_columns = ['Open', 'High', 'Low', 'Close', 'Volume']
raw_scores = stats.zscore(df[attribute_columns])
z_scores = np.abs(raw_scores)
# np.where on the 2-D mask yields a pair of arrays: row positions and column
# positions of the flagged cells.
outliers = np.where(z_scores > 3)
print(f"Outliers Detected:\n{outliers}")

Handling Outliers:

Decide whether to remove or adjust outliers based on their impact on the analysis.

# Interactive Dashboards with Dash
Consider building a Dash app for more interactive exploration.
Benefits:

Users can select different companies to compare.

Provides a more interactive way to explore the data.

In [None]:
# Installation of Dash (if not already installed)
!pip install jupyter-dash



In [None]:
from jupyter_dash import JupyterDash
from dash import dcc, html
from dash.dependencies import Input, Output
import plotly.express as px

# Create the JupyterDash application.
app = JupyterDash(__name__)

# Dropdown entries: one per ticker symbol found in the data.
symbol_options = [dict(label=ticker, value=ticker) for ticker in df['Symbol'].unique()]

price_graph = dcc.Graph(id='price-time-series')
company_dropdown = dcc.Dropdown(
    id='company-selector',
    options=symbol_options,
    value='XOM',  # symbol selected when the app first loads
    multi=True,
)

# Page layout: the chart on top, the company selector underneath.
app.layout = html.Div([price_graph, company_dropdown])

# Callback for updating the time series based on selected company
@app.callback(
    Output('price-time-series', 'figure'),
    Input('company-selector', 'value')
)
def update_graph(selected_symbols):
    """Return a closing-price line chart for the selected ticker symbols.

    Args:
        selected_symbols: Current value of the 'company-selector' dropdown:
            a single symbol string, a list of symbols, or None / an empty
            list once the selection has been cleared.

    Returns:
        A Plotly Express line figure of Close over Date with one line per
        selected symbol (no lines when nothing is selected).
    """
    if not selected_symbols:
        # Series.isin() rejects None, so treat a cleared dropdown as
        # "no symbols" and draw an empty chart instead of raising.
        selected_symbols = []
    elif isinstance(selected_symbols, str):
        # The initial value is a bare string; isin() needs a list-like.
        selected_symbols = [selected_symbols]

    filtered_df = df[df['Symbol'].isin(selected_symbols)]
    fig = px.line(filtered_df, x='Date', y='Close', color='Symbol')
    return fig

# Run the app in the Jupyter notebook: serve it on port 8090 and, with
# mode='inline', display it in this cell's output.
app.run_server(mode='inline', port=8090)
