## Problem_1
### Loans_Data Hypothesis_Testing

In [96]:
import pandas as pd
import numpy as np
from scipy.stats import f_oneway, kruskal, chi2_contingency
import statsmodels.api as sm
from scipy.stats import ttest_rel

In [97]:
loansdata = pd.read_csv('LoansData.csv')

In [98]:
loansdata

Unnamed: 0,Amount.Requested,Amount.Funded.By.Investors,Interest.Rate,Loan.Length,Loan.Purpose,Debt.To.Income.Ratio,State,Home.Ownership,Monthly.Income,FICO.Range,Open.CREDIT.Lines,Revolving.CREDIT.Balance,Inquiries.in.the.Last.6.Months,Employment.Length
0,20000.0,20000.00,8.90%,36 months,debt_consolidation,14.90%,SC,MORTGAGE,6541.67,735-739,14.0,14272.0,2.0,< 1 year
1,19200.0,19200.00,12.12%,36 months,debt_consolidation,28.36%,TX,MORTGAGE,4583.33,715-719,12.0,11140.0,1.0,2 years
2,35000.0,35000.00,21.98%,60 months,debt_consolidation,23.81%,CA,MORTGAGE,11500.00,690-694,14.0,21977.0,1.0,2 years
3,10000.0,9975.00,9.99%,36 months,debt_consolidation,14.30%,KS,MORTGAGE,3833.33,695-699,10.0,9346.0,0.0,5 years
4,12000.0,12000.00,11.71%,36 months,credit_card,18.78%,NJ,RENT,3195.00,695-699,11.0,14469.0,0.0,9 years
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2495,30000.0,29950.00,16.77%,60 months,debt_consolidation,19.23%,NY,MORTGAGE,9250.00,705-709,15.0,45880.0,1.0,8 years
2496,16000.0,16000.00,14.09%,60 months,home_improvement,21.54%,MD,OWN,8903.25,740-744,18.0,18898.0,1.0,10+ years
2497,10000.0,10000.00,13.99%,36 months,debt_consolidation,4.89%,PA,MORTGAGE,2166.67,680-684,4.0,4544.0,0.0,10+ years
2498,6000.0,6000.00,12.42%,36 months,major_purchase,16.66%,NJ,RENT,3500.00,675-679,8.0,7753.0,0.0,5 years


#### 2: Hypothesis Testing

#### Hypothesis a: Interest rate varies for different loan amounts.

In [99]:
loansdata['Interest.rate'] = loansdata['Interest.Rate'].str.replace('%', '').astype(float)

kruskal_stat, p_value = kruskal(*[group['Interest.Rate'] for name, group in loansdata.groupby('Amount.Requested')])

print(f"Kruskal-Wallis Test - Interest rate vs. Loan Amount:")
print(f"Kruskal-Wallis Statistic: {kruskal_stat}")
print(f"P-value: {p_value}")

alpha = 0.05
if p_value < alpha:
    print("Reject null hypothesis: Interest rates vary significantly for different loan amounts.")
else:
    print("Fail to reject null hypothesis: There is no significant difference in interest rates for different loan amounts.")


Kruskal-Wallis Test - Interest rate vs. Loan Amount:
Kruskal-Wallis Statistic: nan
P-value: nan
Fail to reject null hypothesis: There is no significant difference in interest rates for different loan amounts.


#### Hypothesis b: Loan length directly affects interest rate

In [100]:
loansdata['Loan.Length'] = loansdata['Loan.Length'].astype(str)

loansdata['Loan.Length'] = loansdata['Loan.Length'].str.replace(' months', '').astype(int)

loansdata['Interest.Rate'] = loansdata['Interest.Rate'].str.replace('%', '').astype(float)

loansdata['Intercept'] = 1

endog = loansdata['Interest.Rate'].values
exog = loansdata[['Intercept', 'Loan.Length']].values

model = sm.OLS(endog, exog)
results = model.fit()

print(results.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                         nan
Model:                            OLS   Adj. R-squared:                    nan
Method:                 Least Squares   F-statistic:                       nan
Date:                Wed, 26 Jun 2024   Prob (F-statistic):                nan
Time:                        23:01:13   Log-Likelihood:                    nan
No. Observations:                2500   AIC:                               nan
Df Residuals:                    2498   BIC:                               nan
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const             nan        nan        nan        n

#### Hypothesis c: Interest rate varies for different purposes of loans.

In [101]:
anova_results = loansdata.groupby('Loan.Purpose')['Interest.rate'].apply(list).apply(np.array)
f_stat, p_value = f_oneway(*anova_results)

print(f"One-way ANOVA - Interest rate vs. Loan Purpose:")
print(f"F-statistic: {f_stat}")
print(f"P-value: {p_value}")

alpha = 0.05
if p_value < alpha:
    print("Reject null hypothesis: Interest rates vary significantly across different loan purposes.")
else:
    print("Fail to reject null hypothesis: There is no significant difference in interest rates across different loan purposes.")


One-way ANOVA - Interest rate vs. Loan Purpose:
F-statistic: nan
P-value: nan
Fail to reject null hypothesis: There is no significant difference in interest rates across different loan purposes.


#### Hypothesis d: There is a relationship between FICO scores and Home Ownership.

In [102]:
contingency_table = pd.crosstab(loansdata['FICO.Range'], loansdata['Home.Ownership'])

chi2_stat, p_value, _, _ = chi2_contingency(contingency_table)

print(f"Chi-square Test of Independence - FICO vs. Home Ownership:")
print(f"Chi-square Statistic: {chi2_stat}")
print(f"P-value: {p_value}")

alpha = 0.05
if p_value < alpha:
    print("Reject null hypothesis: There is a relationship between FICO scores and Home Ownership.")
else:
    print("Fail to reject null hypothesis: There is no significant relationship between FICO scores and Home Ownership.")


Chi-square Test of Independence - FICO vs. Home Ownership:
Chi-square Statistic: 473.05246368346025
P-value: 1.2021592010244615e-35
Reject null hypothesis: There is a relationship between FICO scores and Home Ownership.


## Problem_2
### Price_Quotes_Hypothesis_Testing

In [103]:
price_data = pd.read_csv('Price_Quotes.csv')

In [104]:
price_data

Unnamed: 0,Order_Number,Barry_Price,Mary_Price
0,1,126,114
1,2,110,118
2,3,138,114
3,4,142,111
4,5,146,129
5,6,136,119
6,7,94,97
7,8,103,104
8,9,140,127
9,10,152,133


In [105]:
mary_quotes = price_data['Mary_Price']
barry_quotes = price_data['Barry_Price']

t_stat, p_value = ttest_rel(mary_quotes, barry_quotes)

print(f"Paired t-test - Mary vs. Barry Price Quotes:")
print(f"T-statistic: {t_stat}")
print(f"P-value: {p_value}")

alpha = 0.05
if p_value < alpha:
    print("Reject null hypothesis: There is a significant difference in the average price quotes provided by Mary and Barry.")
else:
    print("Fail to reject null hypothesis: There is no significant difference in the average price quotes provided by Mary and Barry.")


Paired t-test - Mary vs. Barry Price Quotes:
T-statistic: -2.5213765108923494
P-value: 0.02840588045242053
Reject null hypothesis: There is a significant difference in the average price quotes provided by Mary and Barry.


## Problem_3
#### Treatment_Facility_Hypothesis_Testing

In [106]:
treatment_data = pd.read_csv('Treatment_Facility.csv')
treatment_data.rename(columns={'VAR4': 'TRFF(%)', 'VAR5': 'CI(%)'}, inplace=True)

In [107]:
treatment_data

Unnamed: 0,Month,Reengineer,Employee_Turnover,TRFF(%),CI(%)
0,1,Prior,0.0,24.390244,42.682927
1,2,Prior,6.0606,19.354839,25.806452
2,3,Prior,12.1212,35.087719,146.19883
3,4,Prior,3.3333,18.404908,110.429448
4,5,Prior,12.9032,17.964072,23.952096
5,6,Prior,9.6774,41.176471,47.058824
6,7,Prior,11.7647,13.422819,0.0
7,8,Prior,11.4286,31.25,25.0
8,9,Prior,23.0769,17.241379,132.183908
9,10,Prior,15.0,16.574586,16.574586


In [108]:
from scipy.stats import ttest_ind

prior_data = treatment_data[treatment_data['Reengineer'] == 'Prior']
post_data = treatment_data[treatment_data['Reengineer'] == 'Post']

employee_turnover_prior = prior_data['Employee_Turnover']
employee_turnover_post = post_data['Employee_Turnover']
trff_prior = prior_data['TRFF(%)']
trff_post = post_data['TRFF(%)']
ci_prior = prior_data['CI(%)']
ci_post = post_data['CI(%)']

t_stat_employee, p_value_employee = ttest_ind(employee_turnover_prior, employee_turnover_post)
t_stat_trff, p_value_trff = ttest_ind(trff_prior, trff_post)
t_stat_ci, p_value_ci = ttest_ind(ci_prior, ci_post)

print(f"Paired t-test - Employee Turnover (Prior vs. Post):")
print(f"T-statistic: {t_stat_employee}")
print(f"P-value: {p_value_employee}")

print(f"Paired t-test - TRFF (%) (Prior vs. Post):")
print(f"T-statistic: {t_stat_trff}")
print(f"P-value: {p_value_trff}")

print(f"Paired t-test - CI (%) (Prior vs. Post):")
print(f"T-statistic: {t_stat_ci}")
print(f"P-value: {p_value_ci}")

alpha = 0.05
print("\nInterpretation:")
if p_value_employee < alpha:
    print("Employee Turnover: There is a significant difference in the employee turnover rate before and after reengineering.")
else:
    print("Employee Turnover: There is no significant difference in the employee turnover rate before and after reengineering.")

if p_value_trff < alpha:
    print("TRFF (%): There is a significant difference in the temporary removal rate before and after reengineering.")
else:
    print("TRFF (%): There is no significant difference in the temporary removal rate before and after reengineering.")

if p_value_ci < alpha:
    print("CI (%): There is a significant difference in the critical incident rate before and after reengineering.")
else:
    print("CI (%): There is no significant difference in the critical incident rate before and after reengineering.")


Paired t-test - Employee Turnover (Prior vs. Post):
T-statistic: -1.770306704753604
P-value: 0.09361109345535304
Paired t-test - TRFF (%) (Prior vs. Post):
T-statistic: 2.783398448880451
P-value: 0.012265082138734354
Paired t-test - CI (%) (Prior vs. Post):
T-statistic: 1.627914425352865
P-value: 0.12091989189884148

Interpretation:
Employee Turnover: There is no significant difference in the employee turnover rate before and after reengineering.
TRFF (%): There is a significant difference in the temporary removal rate before and after reengineering.
CI (%): There is no significant difference in the critical incident rate before and after reengineering.


## Problem_4
### Priority_Assessment_Hypothesis_testing

In [109]:
import pandas as pd
import numpy as np
from scipy.stats import f_oneway
import statsmodels.api as sm
from statsmodels.formula.api import ols


priority_data = pd.read_csv('Priority_Assessment.csv')

In [110]:
priority_data

Unnamed: 0,Days,Priority
0,3.3,High
1,7.9,Medium
2,0.3,High
3,0.7,Medium
4,8.6,Medium
...,...,...
637,2.5,Low
638,0.3,High
639,0.3,Medium
640,1.3,Medium


#### Perform ANOVA Test

In [111]:
anova_result = f_oneway(
    priority_data[priority_data['Priority'] == 'High']['Days'],
    priority_data[priority_data['Priority'] == 'Medium']['Days'],
    priority_data[priority_data['Priority'] == 'Low']['Days']
)

print("ANOVA test results:")
print(f"F-statistic: {anova_result.statistic}")
print(f"P-value: {anova_result.pvalue}")

alpha = 0.05
if anova_result.pvalue < alpha:
    print("Reject null hypothesis: There is a significant difference in the mean completion times across the priority levels.")
else:
    print("Fail to reject null hypothesis: There is no significant difference in the mean completion times across the priority levels.")


ANOVA test results:
F-statistic: 1.812311010076072
P-value: 0.16411459461716182
Fail to reject null hypothesis: There is no significant difference in the mean completion times across the priority levels.


#### Post Hoc Test

In [113]:
from statsmodels.stats.multicomp import pairwise_tukeyhsd

tukey_result = pairwise_tukeyhsd(endog=priority_data['Days'], groups=priority_data['Priority'], alpha=0.05)

print(tukey_result)

Multiple Comparison of Means - Tukey HSD, FWER=0.05
group1 group2 meandiff p-adj   lower  upper  reject
---------------------------------------------------
  High    Low   1.2047 0.3709 -0.8998 3.3093  False
  High Medium  -0.5236 0.6205 -1.8441 0.7968  False
   Low Medium  -1.7284 0.1452 -3.8876 0.4309  False
---------------------------------------------------


## Problem_5
#### Films_Data_Hypothesis_Testing

In [114]:
films_data = pd.read_csv('Films.csv')

In [115]:
films_data

Unnamed: 0,_rowstate_,Movie,Gender,Marital_Status,Sinage,Parking,Clean,Overall,Age,Income,Hear_About
0,0,Ferris Buellers Day Off,Female,Married,2.0,2.0,2.0,2.0,3.0,1.0,5
1,0,Ferris Buellers Day Off,Female,Single,1.0,1.0,1.0,1.0,2.0,1.0,5
2,0,Ferris Buellers Day Off,Male,Married,2.0,4.0,3.0,2.0,4.0,1.0,5
3,0,Ferris Buellers Day Off,Female,Married,1.0,3.0,2.0,2.0,4.0,1.0,5
4,0,Ferris Buellers Day Off,Female,Married,1.0,1.0,1.0,1.0,3.0,3.0,1
...,...,...,...,...,...,...,...,...,...,...,...
325,0,Old School,2,2,1.0,2.0,1.0,1.0,2.0,1.0,1
326,0,Old School,1,1,2.0,2.0,2.0,2.0,3.0,3.0,5
327,0,Old School,2,1,2.0,1.0,1.0,2.0,2.0,2.0,5
328,0,Old School,1,1,1.0,1.0,1.0,1.0,2.0,1.0,3


#### Calculate Overall Customer Satisfaction

In [116]:
overall_satisfaction = films_data['Overall'].mean()
print(f"\nOverall Customer Satisfaction: {overall_satisfaction:.2f}")


Overall Customer Satisfaction: 1.62


####  correlations between satisfaction factors and overall satisfaction

In [117]:
satisfaction_factors = ['Sinage', 'Parking', 'Clean', 'Overall']
correlation_matrix = films_data[satisfaction_factors].corr()['Overall']
print("\nCorrelations with Overall Satisfaction:")
print(correlation_matrix)


Correlations with Overall Satisfaction:
Sinage     0.382881
Parking    0.516585
Clean      0.349412
Overall    1.000000
Name: Overall, dtype: float64


#### Demographic Profile of Patrons

In [118]:
gender_counts = films_data['Gender'].value_counts()
marital_status_counts = films_data['Marital_Status'].value_counts()
age_counts = films_data['Age'].value_counts()
income_counts = films_data['Income'].value_counts()

print("Demographic Profile:")
print("Gender Counts:")
print(gender_counts)
print("\nMarital Status Counts:")
print(marital_status_counts)
print("\nAge Counts:")
print(age_counts)
print("\nIncome Counts:")
print(income_counts)

Demographic Profile:
Gender Counts:
Gender
2         194
1         104
Female     19
Male       13
Name: count, dtype: int64

Marital Status Counts:
Marital_Status
2          208
1           88
Single      18
Married     12
Slngle       2
Name: count, dtype: int64

Age Counts:
Age
2.0    175
3.0    117
1.0     26
4.0     10
Name: count, dtype: int64

Income Counts:
Income
1.0    142
3.0     90
2.0     82
Name: count, dtype: int64


#### Effective Media Outlets for Advertising

In [119]:
media_outlets = ['Television', 'Newspaper', 'Radio', 'Website', 'Word of Mouth']
media_counts = films_data['Hear_About'].value_counts()
media_percentages = media_counts / len(films_data) * 100
print("\nMedia Outlets Effectiveness:")
for outlet, percentage in zip(media_outlets, media_percentages):
    print(f"{outlet}: {percentage:.2f}%")


Media Outlets Effectiveness:
Television: 68.48%
Newspaper: 12.42%
Radio: 6.67%
Website: 4.24%
Word of Mouth: 3.64%
