In [1]:
from globals_and_helpers import (
    PROJECT_DIR,
    TEMP_OUTPUTS_DIR,
    FINAL_OUTPUTS_DIR,
 )
import os
import pandas as pd
import numpy as np
import scipy as sp
from scipy import stats
from plotly.offline import iplot
import plotly.graph_objects as go
import plotly.express as px
import plotly.io as pio
import statsmodels.api as sm
import matplotlib.pyplot as plt
print("Matplotlib imported successfully")
pio.templates.default = 'none'

# Directory that holds both 27620 CSV exports compared in this notebook.
datapath = os.path.join(PROJECT_DIR, 'final_outputs', '0.25 scaling v. tiled')

# Read each export into its own unfiltered frame, then preview the tiled one.
scaled_csv = os.path.join(datapath, '27620_0.25scale.csv')
tiled_csv = os.path.join(datapath, '27620_tiled.csv')
df1_unfiltered = pd.read_csv(scaled_csv)
df2_unfiltered = pd.read_csv(tiled_csv)
df2_unfiltered.head()

Matplotlib imported successfully


Unnamed: 0,Object ID,Object type,Classification,Mean Intensity,Unnamed: 4,Mean Intensity Normalized,Stiffness (log (E actual) PA)
0,f4dd3722-9b0b-45be-82b1-b5701390fb23,Annotation,LGD,117.4291,,0.460506,5.752603
1,25ed0478-a9f2-43cf-9c2f-ce5d7d535ab5,Annotation,LGD,100.5337,,0.39425,5.411505
2,57151830-f97e-40c8-816c-6fb8f179df44,Annotation,LGD,86.3118,,0.338478,5.124381
3,a4e29dc8-2683-40ed-8cdc-bc6d3a21c992,Annotation,LGD,119.819,,0.469878,5.800853
4,778a383f-2284-4ea9-9d2d-f7e9a4b88e4d,Annotation,LGD,112.1996,,0.439998,5.647026


In [2]:
# Keep only the identifying columns plus the stiffness measurement in both frames.
keep_cols = ['Object ID', 'Object type', 'Classification', 'Stiffness (log (E actual) PA)']
df2 = df2_unfiltered[keep_cols]
df1 = df1_unfiltered[keep_cols]
df1.head()

Unnamed: 0,Object ID,Object type,Classification,Stiffness (log (E actual) PA)
0,7a0bd252-1529-4c0c-818a-4c171b75ea22,Annotation,LGD,6.848749
1,29e685e5-3482-447f-9160-93966e2c41e6,Annotation,LGD,6.985607
2,344fd8bc-fd5a-4da9-87eb-593c62c8c603,Annotation,LGD,6.41618
3,86ff07ca-9b93-4f62-98c2-34c39b8801c4,Annotation,LGD,6.895183
4,96eb9cec-1b6f-42cb-8487-58ec2e186404,Annotation,LGD,6.701627


In [3]:
# Preview the first five rows of the trimmed tiled frame.
df2.head(n=5)

Unnamed: 0,Object ID,Object type,Classification,Stiffness (log (E actual) PA)
0,f4dd3722-9b0b-45be-82b1-b5701390fb23,Annotation,LGD,5.752603
1,25ed0478-a9f2-43cf-9c2f-ce5d7d535ab5,Annotation,LGD,5.411505
2,57151830-f97e-40c8-816c-6fb8f179df44,Annotation,LGD,5.124381
3,a4e29dc8-2683-40ed-8cdc-bc6d3a21c992,Annotation,LGD,5.800853
4,778a383f-2284-4ea9-9d2d-f7e9a4b88e4d,Annotation,LGD,5.647026


In [4]:
# Pair the two measurements of each annotation. merge() defaults to an inner
# join, so only objects present in both frames are kept.
df_combine = df1.merge(
    df2,
    on=['Object ID', 'Object type', 'Classification'],
    suffixes=(' (1/4 scaled)', ' (tiled)'),
)
df_combine.head()

Unnamed: 0,Object ID,Object type,Classification,Stiffness (log (E actual) PA) (1/4 scaled),Stiffness (log (E actual) PA) (tiled)
0,7a0bd252-1529-4c0c-818a-4c171b75ea22,Annotation,LGD,6.848749,5.368817
1,29e685e5-3482-447f-9160-93966e2c41e6,Annotation,LGD,6.985607,5.966017
2,344fd8bc-fd5a-4da9-87eb-593c62c8c603,Annotation,LGD,6.41618,5.126315
3,86ff07ca-9b93-4f62-98c2-34c39b8801c4,Annotation,LGD,6.895183,5.384763
4,96eb9cec-1b6f-42cb-8487-58ec2e186404,Annotation,LGD,6.701627,5.199659


In [5]:
# Outer join keeps every row from both frames; the `_merge` indicator column
# records whether a row came from the left frame, the right frame, or both.
merged_df = df1.merge(
    df2,
    how='outer',
    on=['Object ID', 'Object type', 'Classification'],
    suffixes=(' (1/4 scaled)', ' (tiled)'),
    indicator=True,
)

# Rows that found no partner in the other frame.
unmerged_rows = merged_df.loc[merged_df['_merge'].ne('both')]

# Save the unmatched rows next to the input CSVs for inspection.
unmerged_csv = os.path.join(datapath, '27620_unmerged_rows.csv')
unmerged_rows.to_csv(unmerged_csv, index=False)

In [6]:
# Summary statistics: 1/4-scaled frame first, then the tiled frame.
for frame in (df1, df2):
    print(frame.describe())

       Stiffness (log (E actual) PA)
count                      60.000000
mean                        6.526701
std                         0.314872
min                         6.013916
25%                         6.206861
50%                         6.579188
75%                         6.797549
max                         7.119532
       Stiffness (log (E actual) PA)
count                      81.000000
mean                        5.573291
std                         0.492706
min                         4.476879
25%                         5.229292
50%                         5.584434
75%                         5.855532
max                         7.121520


In [7]:
# Scatter of 1/4-scaled vs. tiled stiffness for the SAME annotation.
# The points must come from df_combine (joined on Object ID): df1 and df2 have
# different row counts and different row orders, so plotting df1 against df2
# directly would pair unrelated objects by position.
trace = go.Scatter(
    x=df_combine['Stiffness (log (E actual) PA) (1/4 scaled)'],
    y=df_combine['Stiffness (log (E actual) PA) (tiled)'],
    mode="markers",
)
data = [trace]
layout = go.Layout(title="Correlation of Stiffness Values", xaxis=dict(title="1/4 Scaled"), yaxis=dict(title="Tiled"))
iplot({"data": data, "layout": layout})

In [8]:
# Paired scatter of the merged stiffness columns with an OLS trendline;
# hovering over a point shows its 'Object ID'.
fig = px.scatter(
    data_frame=df_combine,
    x='Stiffness (log (E actual) PA) (1/4 scaled)',
    y='Stiffness (log (E actual) PA) (tiled)',
    hover_data=['Object ID'],
    trendline="ols",
)

fig.show()

In [9]:
# One histogram per method, built from the same spec (100 requested bins, 75% opacity).
stiffness_col = 'Stiffness (log (E actual) PA)'
trace0, trace1 = (
    go.Histogram(x=frame[stiffness_col], name=label, opacity=0.75, nbinsx=100)
    for frame, label in ((df1, '1/4 Scaled'), (df2, 'Tiled'))
)
data = [trace0, trace1]
layout = go.Layout(
    title="Distribution of Stiffness Values",
    xaxis=dict(title=stiffness_col),
    yaxis=dict(title="Frequency"),
)
iplot(dict(data=data, layout=layout))

In [10]:
from scipy.stats import gaussian_kde

stiffness_col = 'Stiffness (log (E actual) PA)'


def _density_traces(values, label, color, nbins=100, n_points=200):
    """Build a (histogram, KDE line) pair of traces for one stiffness series.

    The histogram is normalised to a probability density so that it shares a
    y-scale with the Gaussian KDE curve; raw counts and a density curve are
    not comparable on one axis.

    Parameters:
        values: pandas Series of stiffness values.
        label: legend name for the histogram; the KDE is named '<label> KDE'.
        color: line colour for the KDE curve.
        nbins: requested maximum number of histogram bins (plotly `nbinsx`).
        n_points: number of grid points at which the KDE is evaluated.

    Returns:
        Tuple of (go.Histogram, go.Scatter).
    """
    # Evaluate the KDE on an evenly spaced grid spanning the observed range.
    # The grid is computed once and reused for both x and y.
    grid = np.linspace(values.min(), values.max(), n_points)
    hist = go.Histogram(
        x=values,
        name=label,
        opacity=0.75,
        nbinsx=nbins,
        histnorm='probability density',
    )
    kde = go.Scatter(
        x=grid,
        y=gaussian_kde(values)(grid),
        mode='lines',
        name=f'{label} KDE',
        line=dict(color=color, width=2),
    )
    return hist, kde


trace0, kde0 = _density_traces(df1[stiffness_col], '1/4 Scaled', 'blue')
trace1, kde1 = _density_traces(df2[stiffness_col], 'Tiled', 'orange')

# Histograms first so the KDE lines are drawn on top of the bars.
data = [trace0, trace1, kde0, kde1]

# The y-axis is a density (area under each histogram / curve is 1), not a count.
layout = go.Layout(
    title="Distribution of Stiffness Values with KDE",
    xaxis=dict(title=stiffness_col),
    yaxis=dict(title="Density"),
)

fig = go.Figure(data=data, layout=layout)
iplot(fig)


In [11]:
from scipy.stats import spearmanr

# Spearman's rank correlation between the two paired stiffness columns.
scaled_col = 'Stiffness (log (E actual) PA) (1/4 scaled)'
tiled_col = 'Stiffness (log (E actual) PA) (tiled)'
correlation, p_value = spearmanr(df_combine[scaled_col], df_combine[tiled_col])

print(f"Spearman's Rank Correlation: {correlation}")
print(f"P-value: {p_value}")


Spearman's Rank Correlation: 0.17029967075373348
P-value: 0.1932883465170379


In [12]:
# Shapiro-Wilk normality test on the paired 1/4-scaled stiffness values.
stats.shapiro(df_combine['Stiffness (log (E actual) PA) (1/4 scaled)'])

ShapiroResult(statistic=np.float64(0.9308095568509709), pvalue=np.float64(0.002149692282858113))

In [14]:
# Shapiro-Wilk normality test on the paired tiled stiffness values.
stats.shapiro(df_combine['Stiffness (log (E actual) PA) (tiled)'])

ShapiroResult(statistic=np.float64(0.9483822147442253), pvalue=np.float64(0.013081842674462905))

<b>Statistic (Pearson correlation coefficient):</b> 0.133177871846851<br>
<b>P-value:</b> 0.3103831678100672<br>
<br>
<b>Pearson Correlation Coefficient:</b><br>
<br>
The Pearson correlation coefficient (statistic) measures the linear relationship between two variables.
A value of 0.133177871846851 indicates a weak positive linear relationship between the two variables. The coefficient ranges from -1 to 1, where:
- 1 indicates a perfect positive linear relationship.
- -1 indicates a perfect negative linear relationship.
- 0 indicates no linear relationship.<br>
<br>
<b>P-value:</b><br>
<br>
The p-value (pvalue) tests the null hypothesis that there is no correlation between the two variables.
A p-value of 0.3103831678100672 suggests that the correlation is not statistically significant at common significance levels (e.g., 0.05). This means there is not enough evidence to reject the null hypothesis, indicating that the observed correlation could be due to random chance.

In [18]:
# Pearson correlation between the two paired stiffness columns.
scaled_col = 'Stiffness (log (E actual) PA) (1/4 scaled)'
tiled_col = 'Stiffness (log (E actual) PA) (tiled)'
stats.pearsonr(df_combine[scaled_col], df_combine[tiled_col])

PearsonRResult(statistic=np.float64(0.133177871846851), pvalue=np.float64(0.3103831678100672))

In [19]:
# Least-squares line: tiled stiffness (response) regressed on 1/4-scaled stiffness (predictor).
scaled_col = 'Stiffness (log (E actual) PA) (1/4 scaled)'
tiled_col = 'Stiffness (log (E actual) PA) (tiled)'
stats.linregress(df_combine[scaled_col], df_combine[tiled_col])

LinregressResult(slope=np.float64(0.21918480078614833), intercept=np.float64(4.0448631163295765), rvalue=np.float64(0.133177871846851), pvalue=np.float64(0.31038316781006703), stderr=np.float64(0.21417974880452773), intercept_stderr=np.float64(1.3994859269522266))

<b>1. Low $R^2$ Value:</b><br>
- An $R^2$ value of 0.0177 is very close to 0, which indicates that the independent variable (in this case, 'Stiffness (log (E actual) PA) (1/4 scaled)') explains only a very small proportion of the variance in the dependent variable ('Stiffness (log (E actual) PA) (tiled)').
- This suggests that the linear regression model does not fit the data well, and the relationship between the variables is weak.
<br>

<b>2. Goodness of Fit:</b><br>
The $R^2$ value ranges from 0 to 1, where:
- 1 indicates a perfect fit, meaning the model explains all the variability of the response data around its mean.
- 0 indicates that the model does not explain any of the variability of the response data around its mean.

An $R^2$ value of 0.0177 is very low, indicating a poor fit.
<br>
<b>Implications</b>

<b>Weak Relationship:</b> The low $R^2$ value suggests that there is a weak linear relationship between the two variables. This means that changes in the independent variable do not strongly predict changes in the dependent variable.<br>
<b>Model Inadequacy:</b> The linear regression model may not be appropriate for capturing the relationship between these variables. You might need to consider other types of models or transformations of the data.<br>
<b>Data Quality:</b> Ensure that the data is clean and correctly preprocessed. Outliers or errors in the data can affect the $R^2$ value.

In [20]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

# Predictor as an (n, 1) column matrix, response as a flat vector.
X = df_combine[['Stiffness (log (E actual) PA) (1/4 scaled)']].to_numpy()
y = df_combine['Stiffness (log (E actual) PA) (tiled)'].to_numpy()

# fit() returns the estimator itself, so construction and fitting can be chained.
model = LinearRegression().fit(X, y)

# Score the in-sample predictions against the observed response.
y_pred = model.predict(X)
r2 = r2_score(y, y_pred)
print(f"R^2 value: {r2}")


R^2 value: 0.017736345549656418
