### Import Libraries

In [None]:
import mysql.connector  
import pandas as pd  
import matplotlib.pyplot as plt  
import seaborn as sns  
import warnings

# Suppress every warning for the rest of the session so cell outputs stay uncluttered.
# NOTE(review): this also hides deprecation and library-compatibility warnings raised by later cells.
warnings.filterwarnings('ignore')


### Connecting to Database

In [None]:
# Establishing a connection to the MySQL database.
# Connection settings come from environment variables so that credentials are
# not stored in the notebook. Host, user and database fall back to the values
# this notebook has always used; the password is prompted for when
# MYSQL_PASSWORD is not set.
import os
from getpass import getpass

db_password = os.environ.get('MYSQL_PASSWORD')
if db_password is None:
    db_password = getpass('MySQL password: ')

connection = mysql.connector.connect(
    host=os.environ.get('MYSQL_HOST', 'localhost'),
    user=os.environ.get('MYSQL_USER', 'root'),
    password=db_password,
    database=os.environ.get('MYSQL_DATABASE', 'takeatech'),
)

# Running an SQL query to retrieve all the table names from the connected database
tech_tables = pd.read_sql_query('SHOW TABLES', connection)

# Displaying the list of tables in the connected database
tech_tables

### Extracting Data from SQL

In [None]:
# Running an SQL query to retrieve all the data from the 'feedback_data' table
feedback = pd.read_sql_query('SELECT * FROM feedback_data', connection)

# Displaying the first 5 rows of the data fetched from the 'feedback_data' table
feedback.head()


In [None]:
# Running an SQL query to retrieve all the data from the 'product_data' table
product = pd.read_sql_query('SELECT * FROM product_data', connection)

# Displaying the first 5 rows of the data fetched from the 'product_data' table
product.head()


In [None]:
# Running an SQL query to retrieve all the data from the 'sales_data' table
sales = pd.read_sql_query('SELECT * FROM sales_data', connection)

# Displaying the first 5 rows of the data fetched from the 'sales_data' table
sales.head()


### Inspecting Headers

In [None]:
# Running an SQL query to retrieve the names and details of the columns in the 'feedback_data' table
feedback_headers = pd.read_sql_query('SHOW COLUMNS FROM feedback_data', connection)

# Displaying the column names and their details from the 'feedback_data' table
feedback_headers


In [None]:
# Running an SQL query to retrieve the names and details of the columns in the 'product_data' table
product_headers = pd.read_sql_query('SHOW COLUMNS FROM product_data', connection)

# Displaying the column names and their details from the 'product_data' table
product_headers


In [None]:
# Running an SQL query to retrieve the names and details of the columns in the 'sales_data' table
sales_headers = pd.read_sql_query('SHOW COLUMNS FROM sales_data', connection)

# Displaying the column names and their details from the 'sales_data' table
sales_headers


### Descriptive Analysis

In [None]:
# Generating descriptive statistics for all columns in the 'feedback' dataset
feedback_stats = feedback.describe(include='all')

# Displaying the generated statistics
feedback_stats


In [None]:
# Generating descriptive statistics for all columns in the 'product' dataset
product_stats = product.describe(include='all')

# Displaying the generated statistics
product_stats


In [None]:
# Generating descriptive statistics for all columns in the 'sales' dataset
sales_stats = sales.describe(include='all')

# Displaying the generated statistics
sales_stats


### Joining Data

In [None]:
# Collapse the sales table to one row per product: keep the first sale id and
# sale date seen for the product, and total its volume and revenue.
sales_aggregated = sales.groupby('productid').agg(
    saleid=('saleid', 'first'),
    salesdate=('salesdate', 'first'),
    salesvolume=('salesvolume', 'sum'),
    revenuegenerated=('revenuegenerated', 'sum'),
).reset_index()

# Attach the product attributes to the per-product sales totals
salesproduct_merged = sales_aggregated.merge(product, on='productid', how='inner')

# Attach the per-product sales/product columns to every feedback row.
# Because the join key is productid, the per-product totals are repeated on
# each feedback row that belongs to the same product.
all_data = feedback.merge(salesproduct_merged, on='productid', how='inner')

# Duplicate rows are intentionally left in place; uncomment to remove them.
#all_data.drop_duplicates(inplace=True)


In [None]:
# Save the joined dataset to disk for reuse outside the notebook.
# DataFrame.to_csv returns None when it is given a file path, so the result is
# not bound to a name (the former `df` was always None).
all_data.to_csv('joined_data.csv', index=False)

In [None]:
# Preview the first 10 rows of the joined dataset
all_data.head(10)

In [None]:
# Print the joined dataset's column names, non-null counts and dtypes
all_data.info()

In [None]:
# Return rate = returns per 100 units sold.
# The notebook treats every feedback row as one return, so the number of
# feedback IDs is the return volume.
return_volume = all_data['feedbackid'].count()

# 'salesvolume' is a per-product total that the join repeats on every feedback
# row of that product. Count each product once; summing the column directly
# would multiply a product's sales by its number of feedback rows.
sales_volume = all_data.drop_duplicates(subset='productid')['salesvolume'].sum()

# Avoid a meaningless division when there are no recorded sales
productreturn_rate = (return_volume * 100) / sales_volume if sales_volume else float('nan')
print('Sales Volume:', sales_volume, '\n', 'Return Volume:', return_volume, '\n', 'Return Rate: ', round(productreturn_rate, 2))

### Exploratory Data Analysis (EDA) - Univariate

In [None]:
plt.style.use('classic')

# all_data holds one row per feedback record, so each product's aggregated
# sales figures are repeated on every one of its feedback rows. Keep a single
# row per product for the sales charts; otherwise products with more feedback
# are over-counted in the histograms and in the per-category totals.
product_sales = all_data.drop_duplicates(subset='productid')

# Setting up a figure with 2 rows and 2 columns for subplots
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(20, 15), gridspec_kw={'hspace': 0.4})

# Custom colors (all four panels use the last one)
custom_colors = ['#0072B2', '#E69F00', '#009E73', '#CC79A7']
panel_color = custom_colors[3]

# Sales Volume (one observation per product)
sns.histplot(product_sales['salesvolume'], kde=True, ax=axes[0, 0], color=panel_color)
axes[0, 0].set_title('Distribution of Sales Volume', fontsize=14)
axes[0, 0].set_xlabel('Sales Volume', fontsize=12)
axes[0, 0].set_ylabel('Frequency', fontsize=12)

# Revenue Generated (one observation per product)
sns.histplot(product_sales['revenuegenerated'], kde=True, ax=axes[0, 1], color=panel_color)
axes[0, 1].set_title('Distribution of Revenue Generated', fontsize=14)
axes[0, 1].set_xlabel('Revenue Generated', fontsize=12)
axes[0, 1].set_ylabel('Frequency', fontsize=12)

# Sales Volume by Product Category (each product's total counted once)
category_sales = product_sales.groupby('productcategory')['salesvolume'].sum()
category_sales.plot(kind='bar', ax=axes[1, 0], color=panel_color, width=0.4)
axes[1, 0].set_title('Sales Volume by Product Category')
axes[1, 0].set_xlabel('Product Category')
axes[1, 0].set_ylabel('Sales')
# Keep the category labels horizontal
axes[1, 0].tick_params(axis='x', rotation=0)

# Return Reason (feedback-level data, so every row of all_data is counted)
return_reason_counts = all_data['returnreason'].value_counts()
return_reason_counts.plot(kind='bar', ax=axes[1, 1], color=panel_color, width=0.7)
axes[1, 1].set_title('Return Reasons', fontsize=14)
axes[1, 1].set_xlabel('Return Reason', fontsize=12)
axes[1, 1].set_ylabel('Count', fontsize=12)
axes[1, 1].tick_params(axis='x', rotation=45)

# Adjust layout, leaving room at the top for the figure title
fig.suptitle('Univariate Analysis', fontsize=16)
plt.tight_layout(rect=[0, 0, 1, 0.95])
plt.show()


### Exploratory Data Analysis (EDA) - Bivariate

In [None]:
# Count how often each return reason occurs within each product category
stacked_data = pd.crosstab(index=all_data['returnreason'], columns=all_data['productcategory'])

# Order the category columns from the most to the fewest returns overall
category_totals = stacked_data.sum(axis=0)
column_order = category_totals.sort_values(ascending=False).index
stacked_data = stacked_data[column_order]

# One colour per stacked category segment
custom_colors = ['#0072B2', '#009E73', '#CC79A7']

# Draw the horizontal stacked bars and keep a handle on the resulting axes
reason_ax = stacked_data.plot.barh(stacked=True, figsize=(15, 10), color=custom_colors)

# Title and axis labels
reason_ax.set_title('Return Reason By Product Category')
reason_ax.set_xlabel('Count of Returns')
reason_ax.set_ylabel('Return Reason')

# Place the legend outside the plotting area, to the right
reason_ax.legend(title='Product Category', bbox_to_anchor=(1.05, 1), loc='upper left')

# Figure-level title, with the layout leaving room for it at the top
reason_ax.figure.suptitle('Bivariate Analysis', fontsize=16)
reason_ax.figure.tight_layout(rect=[0, 0, 1, 0.95])
plt.show()

### Analysis

In [None]:
# Extracting insights from 'Customer Feedback' related to product quality.
# Feedback that contains any of the phrases below (case-insensitive substring
# match) is assumed to describe a quality issue.
quality_related_terms = ['defective',
                        'poor quality',
                        'disappointing',
                        'poor',
                        'malfunctioning',
                        'faulty',
                        'subpar',
                        'unsatisfactory',
                        'unreliable',
                        'underwhelming',
                        'limited functionality',
                        'performance issues',
                        'software bugs',
                        'hardware failure',
                        'glitchy',
                        'inconsistent',
                        'slow',
                        'overheating',
                        'not worth the price',
                        'broke after a few uses',
                        'short lifespan',
                        'low battery life',
                        'not responsive',
                        'poor sound quality',
                        'screen issues',
                        'connectivity problems'
]


def _mentions_quality_issue(text):
    """Return True if *text* contains any quality-related term.

    Missing or non-text feedback (None, NaN) is treated as having no quality
    issue instead of raising on ``.lower()``.
    """
    if not isinstance(text, str):
        return False
    # Lower-case once rather than once per term
    lowered = text.lower()
    return any(term in lowered for term in quality_related_terms)


# Adding a new column 'qualityissue' to indicate if the feedback suggests a quality issue
all_data['qualityissue'] = all_data['customerfeedback'].apply(_mentions_quality_issue)

# Counting feedback rows with and without a quality issue
quality_issue_returns = all_data['qualityissue'].value_counts()

# Visualizing the relationship between product category and quality issues
plt.figure(figsize=(15, 10))
custom_colors = ['#009E73', '#CC79A7']
sns.countplot(x='productcategory', hue='qualityissue', data=all_data, width=0.7, palette=custom_colors)

# Titles, labels and a legend placed outside the plotting area
plt.title('Quality Issues by Product Category', fontsize=16)
plt.ylabel('Count of Quality Issues', fontsize=14)
plt.xlabel('Product Category', fontsize=14)
plt.legend(title='Quality Issues', bbox_to_anchor=(1.05, 1), loc='upper left')

plt.show()

In [None]:
# Count feedback rows with and without a quality issue inside each product category
quality_by_category = all_data.groupby('productcategory')['qualityissue']
quality_by_category.value_counts()


In [None]:
# Test whether the presence of a quality issue is independent of product
# category, using a chi-square test of independence.
from scipy.stats import chi2_contingency
import matplotlib.colors as mcolors

# Observed counts: one row per product category, one column per qualityissue value
contingency_table = pd.crosstab(index=all_data['productcategory'], columns=all_data['qualityissue'])

# Run the test; `expected` holds the counts expected under independence
chi2, p, dof, expected = chi2_contingency(contingency_table)

# Collect the test outputs under readable labels
chi2_test_result = dict(zip(
    ("Chi-Square Statistic", "p-value", "Degrees of Freedom", "Expected Frequencies"),
    (chi2, p, dof, expected),
))

In [None]:
# Display the counts of feedback rows flagged / not flagged as a quality issue
quality_issue_returns

In [None]:
# Display the chi-square statistic, p-value, degrees of freedom and expected frequencies
chi2_test_result

## Visualisation

In [None]:

# Observed counts of quality-issue flags per product category
contingency_table = pd.crosstab(index=all_data['productcategory'], columns=all_data['qualityissue'])

# Chi-square test of independence on the observed counts
chi2, p, dof, expected = chi2_contingency(contingency_table)

# Test outputs gathered under readable labels
chi2_test_result = {
    "Chi-Square Statistic": chi2,
    "p-value": p,
    "Degrees of Freedom": dof,
    "Expected Frequencies": expected,
}

# Give the expected counts the same row/column labels as the observed table
expected_df = pd.DataFrame(
    data=expected,
    index=contingency_table.index,
    columns=contingency_table.columns,
)

# Two side-by-side panels: observed on the left, expected on the right
fig, ax = plt.subplots(1, 2, figsize=(15, 10))

# Colour map blended from the notebook's four palette colours
custom_cmap = mcolors.LinearSegmentedColormap.from_list(
    "custom_cmap", ['#0072B2', '#E69F00', '#009E73', '#CC79A7']
)

# Draw both heatmaps with identical styling so they can be compared directly
heatmap_panels = zip(
    ax,
    (contingency_table, expected_df),
    ('Observed Frequencies', 'Expected Frequencies'),
)
for panel_axis, frequencies, panel_title in heatmap_panels:
    sns.heatmap(frequencies, annot=True, annot_kws={"color": "black"}, cmap=custom_cmap, ax=panel_axis, fmt='g')
    panel_axis.set_title(panel_title)
    panel_axis.set_xlabel('Quality Issues')
    panel_axis.set_ylabel('Product Category')

plt.tight_layout()
plt.show()


##### Quality Issue Analysis
###### Frequency of Returns Due to Quality Issues:
- There are 2,717 instances where the customer feedback suggests a quality issue.
- 12,404 instances do not indicate a quality issue in customer feedback.
###### Visualization: Quality Issues by Product Category

- The chart shows the count of quality-related issues across different product categories.

##### Statistical Analysis: Chi-Square Test of Independence
- Chi-Square Statistic: 6.559
- p-value: 0.038
- Degrees of Freedom: 2

##### Expected Frequencies:
- Accessories: Expected frequency of returns without and with quality issues - 4543.73 and 995.27, respectively.
- Smartphones: Expected frequency (without, with) - 3620.06 and 792.94.
- Laptops: Expected frequency (without, with) - 4240.21 and 928.79.

###### Interpretation
- Statistical Significance: The p-value of 0.038 is below the conventional 0.05 significance threshold, so the test indicates a statistically significant association between product category and the presence of quality issues in the customer feedback.

##### Implications:
- This implies that the quality issues leading to returns are not uniformly distributed across product categories.
- Given this information, TechTrend Pro can focus on the categories with higher rates of quality-related issues, potentially addressing specific aspects of product quality in those categories to reduce return rates.

### Insights

##### Quality-Related Returns Vary Across Categories:
- Roughly 18% of returns (2,717 of 15,121 feedback records) are linked to quality issues, and the share differs across product categories.
- The chi-square test indicated a statistically significant relationship between product categories and quality issues.

##### Specific Categories with Higher Quality Issues:
- Among the product categories, some show a higher prevalence of quality-related feedback. These categories warrant closer inspection and targeted improvements.

##### Customer Feedback as a Quality Indicator:
- Customer feedback frequently mentions quality issues like 'defective products' or 'poor quality', highlighting the importance of addressing these concerns.

### Recommendations
##### Enhanced Quality Control for Targeted Categories:
- TechTrend Pro should implement stricter quality control measures, especially for the product categories with a higher incidence of quality-related returns. This could involve more rigorous testing and inspection processes.

##### Review and Strengthen Supplier Relationships:
- Analyze and review supplier performance. For suppliers consistently linked to quality issues, consider renegotiation of terms, additional quality checks, or seeking alternative suppliers.

##### Leverage Customer Feedback for Product Improvements:
- Utilize the insights from customer feedback to inform product development. Identify common complaints and address these in future product designs or updates.

##### Enhance Customer Education and Support:
- Some returns may be due to misunderstandings about product use. Providing better educational materials and proactive customer support could reduce such returns.

##### Regular Data Analysis for Continuous Improvement:
- Continue to analyze sales, feedback, and return data regularly to identify emerging trends or new issues, allowing for timely corrective actions.

##### Invest in Predictive Analytics:
- Implement predictive analytics to identify potential quality issues before products are shipped. This could help in proactively addressing problems and reducing future returns.

### Conclusion
By focusing on these areas, TechTrend Pro can potentially reduce return rates, leading to increased customer satisfaction and reduced operational costs associated with handling returns. The key is a combination of proactive quality management, continuous data monitoring, and leveraging customer feedback for ongoing improvements.