# **1- Data Collection**

**1.1- Importing the Data Files**

In [None]:
import pandas as pd

# Importing CSV files
# Both exports share one dialect: semicolon-delimited and Latin-1 encoded.
# on_bad_lines='warn' still skips malformed rows (same resulting frames as 'skip')
# but reports each one, so lost survey responses do not go unnoticed.
csv_options = dict(low_memory=False, encoding='latin-1', on_bad_lines='warn', delimiter=';')

# Numerically coded responses
df_coded = pd.read_csv('/kaggle/input/chatbots-impact-on-university-learning/Impact of Conversational Chatbots on Learning of University Students/AI_Chatbots_Students_Attitude_Dataset_Coded_EN.csv', **csv_options)
# Responses as English text
df_en = pd.read_csv('/kaggle/input/chatbots-impact-on-university-learning/Impact of Conversational Chatbots on Learning of University Students/AI_Chatbots_Students_Attitude_Dataset_EN.csv', **csv_options)

**1.21- Checking Data Types**

In [None]:
# Inferred dtype of every column in each dataset.
data_types_coded = df_coded.dtypes
data_types_en = df_en.dtypes

# A bare assignment renders nothing in a notebook, so end the cell with a
# side-by-side table (rows = column names) to make the check visible.
pd.DataFrame({'coded': data_types_coded, 'en': data_types_en})

**1.22- Checking Duplicates**

In [None]:
# Number of rows that are exact duplicates of an earlier row in each dataset.
duplicates_coded = df_coded.duplicated().sum()
duplicates_en = df_en.duplicated().sum()

# Display the counts; assignments alone produce no cell output.
pd.Series({'coded': duplicates_coded, 'en': duplicates_en}, name='duplicate_rows')

# **2- Data Exploration**

**2.1- Understanding the Structure**

**2.11- View the First Few Rows**

In [None]:
# Preview the first rows of the coded dataset (head() defaults to 5 rows).
df_coded.head()

In [None]:
# Preview the first rows of the English dataset (head() defaults to 5 rows).
df_en.head()

**2.12- View DataFrame Information**

In [None]:
# Print the coded dataset's column names, non-null counts, dtypes and memory usage.
df_coded.info()

In [None]:
# Print the English dataset's column names, non-null counts, dtypes and memory usage.
df_en.info()

**2.13- Summary Statistics**

In [None]:
# Summary statistics for the coded dataset.
# With default arguments describe() covers only numeric columns when any are present.
df_coded.describe()

In [None]:
# Summary statistics for the English dataset.
# With default arguments describe() covers only numeric columns when any are present.
df_en.describe()

**2.14- View Column Names**

In [None]:
# Column labels of the coded dataset.
df_coded.columns

In [None]:
# Column labels of the English dataset.
df_en.columns

**2.15- Shape of the Data**

In [None]:
# (rows, columns) of the coded dataset.
df_coded.shape

In [None]:
# (rows, columns) of the English dataset.
df_en.shape

**2.2- Missing Values**

**2.21- Identifying Missing Values**

In [None]:
# Per-column count of missing (NaN/None) entries in each dataset.
missing_values_coded = df_coded.isnull().sum()
missing_values_en = df_en.isnull().sum()

# Show the counts side by side (rows = column names); the assignments alone
# would leave this "identify" step without any visible output.
pd.DataFrame({'coded': missing_values_coded, 'en': missing_values_en})

In [None]:
# Row-wise cleaning: keep only responses that have a value in every column.
# The source frames are left untouched; each result is a new DataFrame.
df_coded_cleaned = df_coded.dropna(axis=0, how='any')
df_en_cleaned = df_en.dropna(axis=0, how='any')

# Column-wise cleaning: keep only columns that are complete for every response.
df_coded_cleaned_col = df_coded.dropna(axis='columns', how='any')
df_en_cleaned_col = df_en.dropna(axis='columns', how='any')

# **3- Data Cleaning**

**3.1- Identifying Outliers using Z-Score**

In [None]:
from scipy import stats

# Z-score every float64/int64 column of each dataset.
# nan_policy='omit' computes each column's mean/std from its non-missing values.
# With the default ('propagate') a single NaN turns the whole column's z-scores
# into NaN, so outliers in that column could never be detected.
numeric_coded = df_coded.select_dtypes(include=['float64', 'int64'])
numeric_en = df_en.select_dtypes(include=['float64', 'int64'])
z_scores_coded = stats.zscore(numeric_coded, nan_policy='omit')
z_scores_en = stats.zscore(numeric_en, nan_policy='omit')

# Identify outliers: rows where at least one numeric value has |z| > 3.
# Missing values have a NaN z-score, which compares False and is never flagged.
df_outliers_coded = df_coded[((z_scores_coded > 3) | (z_scores_coded < -3)).any(axis=1)]
df_outliers_en = df_en[((z_scores_en > 3) | (z_scores_en < -3)).any(axis=1)]

**3.2- Remove Outliers**

In [None]:
# Remove rows with outliers based on Z-score.
# Build the outlier mask (|z| > 3 in any numeric column) and keep its complement.
# Testing `(z < 3).all() & (z > -3).all()` instead would also discard every row
# whose z-scores contain NaN (NaN comparisons are False) and rows sitting at
# exactly +/-3, so the "outlier" and "no outlier" sets would not partition the data.
is_outlier_coded = ((z_scores_coded > 3) | (z_scores_coded < -3)).any(axis=1)
is_outlier_en = ((z_scores_en > 3) | (z_scores_en < -3)).any(axis=1)

df_no_outliers_coded = df_coded[~is_outlier_coded]
df_no_outliers_en = df_en[~is_outlier_en]

# **4- Exploratory Analysis**

**What is the distribution of responses for each question (e.g., Q1)?**

In [None]:
import plotly.express as px

# Both Q1 histograms use the same title, so define it once.
q1_title = 'What is the distribution of responses for Q1?'

# Q1 response counts in the coded dataset
fig_coded_q1 = px.histogram(data_frame=df_coded, x='Q1', title=q1_title)
fig_coded_q1.show()

# Q1 response counts in the English dataset
fig_en_q1 = px.histogram(data_frame=df_en, x='Q1', title=q1_title)
fig_en_q1.show()

**How do students' attitudes towards chatbots differ between specific questions (e.g., Q5.1 and Q5.2)?**

In [None]:
# Grouped bar plot comparing Q5.1 and Q5.2.
# px.bar on the raw rows draws one stacked segment per respondent, so each bar's
# height is the sum of Q5.2 values rather than a comparison. Counting rows with
# px.histogram, coloured by Q5.2 and with barmode='group', yields one bar per
# Q5.2 answer inside every Q5.1 answer.
q5_labels = {'Q5.1': 'Q5.1 Response', 'Q5.2': 'Q5.2 Response'}

# Coded dataset
fig_coded_q5 = px.histogram(
    df_coded,
    x='Q5.1',
    color='Q5.2',
    barmode='group',
    title="How do students' attitudes differ between Q5.1 and Q5.2 in the coded dataset?",
    labels=q5_labels,
)
fig_coded_q5.show()

# English dataset
fig_en_q5 = px.histogram(
    df_en,
    x='Q5.1',
    color='Q5.2',
    barmode='group',
    title="How do students' attitudes differ between Q5.1 and Q5.2 in the English dataset?",
    labels=q5_labels,
)
fig_en_q5.show()

**Are there any patterns in the responses over time (using the Timestamp column)?**

In [None]:
# Convert Timestamp to datetime (in place; later cells plot against this column).
# NOTE(review): the date format is inferred by pandas — confirm day/month order
# against the raw CSV.
df_coded['Timestamp'] = pd.to_datetime(df_coded['Timestamp'])
df_en['Timestamp'] = pd.to_datetime(df_en['Timestamp'])

# px.line joins points in row order, so plot time-sorted copies; otherwise any
# out-of-order submission makes the line double back on itself.
# Line chart of Q1 responses over time in coded dataset
fig_coded_time = px.line(
    df_coded.sort_values('Timestamp'),
    x='Timestamp',
    y='Q1',
    title='Are there any patterns in Q1 responses over time in the coded dataset?',
    labels={'Q1': 'Q1 Response'},
)
fig_coded_time.show()

# Line chart of Q1 responses over time in English dataset
fig_en_time = px.line(
    df_en.sort_values('Timestamp'),
    x='Timestamp',
    y='Q1',
    title='Are there any patterns in Q1 responses over time in the English dataset?',
    labels={'Q1': 'Q1 Response'},
)
fig_en_time.show()

**What is the distribution of responses for multiple choice questions (e.g., Q6.1 to Q6.8)?**

In [None]:
# Reshape Q6.1–Q6.8 from wide to long: one row per (Timestamp, Question, Response).
q6_columns = ['Q6.1', 'Q6.2', 'Q6.3', 'Q6.4', 'Q6.5', 'Q6.6', 'Q6.7', 'Q6.8']
melt_options = dict(
    id_vars=['Timestamp'],
    value_vars=q6_columns,
    var_name='Question',
    value_name='Response',
)
df_coded_melted = df_coded.melt(**melt_options)
df_en_melted = df_en.melt(**melt_options)

# Response counts per question, coloured by answer — coded dataset
fig_coded_multichoice = px.histogram(
    data_frame=df_coded_melted,
    x='Question',
    color='Response',
    title='What is the distribution of responses for multiple choice questions in the coded dataset?',
)
fig_coded_multichoice.show()

# Response counts per question, coloured by answer — English dataset
fig_en_multichoice = px.histogram(
    data_frame=df_en_melted,
    x='Question',
    color='Response',
    title='What is the distribution of responses for multiple choice questions in the English dataset?',
)
fig_en_multichoice.show()

**What is the relationship between responses to questions Q9.1 and Q9.2?**

In [None]:
# Q9.1 (x) against Q9.2 (y), one marker per response — coded dataset
fig_coded_q9 = px.scatter(
    data_frame=df_coded,
    x='Q9.1',
    y='Q9.2',
    title="What is the relationship between responses to Q9.1 and Q9.2 in the coded dataset?",
)
fig_coded_q9.show()

# Q9.1 (x) against Q9.2 (y), one marker per response — English dataset
fig_en_q9 = px.scatter(
    data_frame=df_en,
    x='Q9.1',
    y='Q9.2',
    title="What is the relationship between responses to Q9.1 and Q9.2 in the English dataset?",
)
fig_en_q9.show()

**How do responses to all questions differ across categorical variables (e.g., comparing Q1 to Q3)?**

In [None]:
# Grouped bar chart for comparing Q1 and Q3.
# px.bar on the raw rows stacks one segment per respondent, so bar heights are
# sums of Q3 values rather than a comparison. Counting rows with px.histogram,
# coloured by Q3 and with barmode='group', shows how the Q3 answers are
# distributed within each Q1 answer.
q1_q3_labels = {'Q1': 'Q1 Response', 'Q3': 'Q3 Response'}

# Coded dataset
fig_coded_q1_q3 = px.histogram(
    df_coded,
    x='Q1',
    color='Q3',
    barmode='group',
    title="How do responses to Q1 differ from Q3 in the coded dataset?",
    labels=q1_q3_labels,
)
fig_coded_q1_q3.show()

# English dataset
fig_en_q1_q3 = px.histogram(
    df_en,
    x='Q1',
    color='Q3',
    barmode='group',
    title="How do responses to Q1 differ from Q3 in the English dataset?",
    labels=q1_q3_labels,
)
fig_en_q1_q3.show()

**How does the response to Q8.1 differ across other related variables (e.g., Q8.2, Q8.3)?**

In [None]:
# Pairwise scatter plots of the three Q8 items.
q8_dimensions = ['Q8.1', 'Q8.2', 'Q8.3']

# Coded dataset
fig_coded_q8 = px.scatter_matrix(
    data_frame=df_coded,
    dimensions=q8_dimensions,
    title="How do responses to Q8.1, Q8.2, and Q8.3 relate in the coded dataset?",
)
fig_coded_q8.show()

# English dataset
fig_en_q8 = px.scatter_matrix(
    data_frame=df_en,
    dimensions=q8_dimensions,
    title="How do responses to Q8.1, Q8.2, and Q8.3 relate in the English dataset?",
)
fig_en_q8.show()

**Are there any patterns when analyzing the responses to Q6.1 to Q6.8?**

In [None]:
# Q6.1–Q6.8 plotted against Timestamp (wide-form input: one trace per column).
q6_items = ['Q6.1', 'Q6.2', 'Q6.3', 'Q6.4', 'Q6.5', 'Q6.6', 'Q6.7', 'Q6.8']

# Coded dataset
fig_coded_q6 = px.scatter(
    data_frame=df_coded,
    x='Timestamp',
    y=q6_items,
    title="What are the patterns in responses to Q6.1 to Q6.8 in the coded dataset?",
)
fig_coded_q6.show()

# English dataset
fig_en_q6 = px.scatter(
    data_frame=df_en,
    x='Timestamp',
    y=q6_items,
    title="What are the patterns in responses to Q6.1 to Q6.8 in the English dataset?",
)
fig_en_q6.show()

**Do responses to Q9.1 and Q9.5 show any trends over time?**

In [None]:
# Scatter chart of Q9.1 and Q9.5 over time.
# Passing a list to `y` is wide-form input: plotly melts the columns into
# 'variable' (question) and 'value' (response). Labels keyed by the original
# column names ('Q9.1', 'Q9.5') therefore match nothing and are ignored; the
# axis and legend titles have to be set through 'value' and 'variable'.
q9_time_labels = {'value': 'Response', 'variable': 'Question'}

# Coded dataset
fig_coded_q9_time = px.scatter(
    df_coded,
    x='Timestamp',
    y=['Q9.1', 'Q9.5'],
    title="Do responses to Q9.1 and Q9.5 show any trends over time in the coded dataset?",
    labels=q9_time_labels,
)
fig_coded_q9_time.show()

# English dataset
fig_en_q9_time = px.scatter(
    df_en,
    x='Timestamp',
    y=['Q9.1', 'Q9.5'],
    title="Do responses to Q9.1 and Q9.5 show any trends over time in the English dataset?",
    labels=q9_time_labels,
)
fig_en_q9_time.show()

**How does the distribution of Q3 responses compare to other related questions (e.g., Q2, Q4)?**

In [None]:
# Histogram of Q3 split by Q4 (colour), with a rug plot in the margin.
# NOTE(review): the titles mention Q2, but only Q3 and Q4 are plotted here.

# Coded dataset
fig_coded_q3_dist = px.histogram(
    data_frame=df_coded,
    x='Q3',
    color='Q4',
    marginal='rug',
    title="How does the distribution of Q3 responses compare to Q2 and Q4 in the coded dataset?",
)
fig_coded_q3_dist.show()

# English dataset
fig_en_q3_dist = px.histogram(
    data_frame=df_en,
    x='Q3',
    color='Q4',
    marginal='rug',
    title="How does the distribution of Q3 responses compare to Q2 and Q4 in the English dataset?",
)
fig_en_q3_dist.show()

**What are the overall trends when comparing responses to Q7.1, Q7.2, Q7.3, and Q7.4?**

In [None]:
# Q7.1–Q7.4 plotted against Timestamp (wide-form input: one trace per column).
q7_items = ['Q7.1', 'Q7.2', 'Q7.3', 'Q7.4']

# Coded dataset
fig_coded_q7 = px.scatter(
    data_frame=df_coded,
    x='Timestamp',
    y=q7_items,
    title="What are the overall trends for Q7.1 to Q7.4 in the coded dataset?",
)
fig_coded_q7.show()

# English dataset
fig_en_q7 = px.scatter(
    data_frame=df_en,
    x='Timestamp',
    y=q7_items,
    title="What are the overall trends for Q7.1 to Q7.4 in the English dataset?",
)
fig_en_q7.show()

**How do students' attitudes change from Q5.4 to Q5.6?**

In [None]:
# Q5.4, Q5.5 and Q5.6 plotted against Timestamp (one trace per column).
q5_change_items = ['Q5.4', 'Q5.5', 'Q5.6']

# Coded dataset
fig_coded_q5_changes = px.scatter(
    data_frame=df_coded,
    x='Timestamp',
    y=q5_change_items,
    title="How do students' attitudes change from Q5.4 to Q5.6 in the coded dataset?",
)
fig_coded_q5_changes.show()

# English dataset
fig_en_q5_changes = px.scatter(
    data_frame=df_en,
    x='Timestamp',
    y=q5_change_items,
    title="How do students' attitudes change from Q5.4 to Q5.6 in the English dataset?",
)
fig_en_q5_changes.show()


**Are there any visible time-based trends for responses to Q5.1 and Q5.6?**

In [None]:
# Q5.1 and Q5.6 plotted against Timestamp (one trace per column).
q5_trend_items = ['Q5.1', 'Q5.6']

# Coded dataset
fig_coded_q5_time = px.scatter(
    data_frame=df_coded,
    x='Timestamp',
    y=q5_trend_items,
    title="Are there any time-based trends for Q5.1 and Q5.6 responses in the coded dataset?",
)
fig_coded_q5_time.show()

# English dataset
fig_en_q5_time = px.scatter(
    data_frame=df_en,
    x='Timestamp',
    y=q5_trend_items,
    title="Are there any time-based trends for Q5.1 and Q5.6 responses in the English dataset?",
)
fig_en_q5_time.show()


**How do responses to Q7.1 differ across the overall dataset?**

In [None]:
# Q7.1 response counts, with the bin count capped via nbins=20.

# Coded dataset
fig_coded_q7_dist = px.histogram(
    data_frame=df_coded,
    x='Q7.1',
    nbins=20,
    title="How do responses to Q7.1 differ across the coded dataset?",
)
fig_coded_q7_dist.show()

# English dataset
fig_en_q7_dist = px.histogram(
    data_frame=df_en,
    x='Q7.1',
    nbins=20,
    title="How do responses to Q7.1 differ across the English dataset?",
)
fig_en_q7_dist.show()

**What are the general trends in responses across all Q6 questions over time?**

In [None]:
# Every Q6 item plotted against Timestamp (wide-form input: one trace per column).
all_q6_items = ['Q6.1', 'Q6.2', 'Q6.3', 'Q6.4', 'Q6.5', 'Q6.6', 'Q6.7', 'Q6.8']

# Coded dataset
fig_coded_q6_all = px.scatter(
    data_frame=df_coded,
    x='Timestamp',
    y=all_q6_items,
    title="What are the general trends in responses across all Q6 questions over time in the coded dataset?",
)
fig_coded_q6_all.show()

# English dataset
fig_en_q6_all = px.scatter(
    data_frame=df_en,
    x='Timestamp',
    y=all_q6_items,
    title="What are the general trends in responses across all Q6 questions over time in the English dataset?",
)
fig_en_q6_all.show()

**What is the distribution of responses for Q2 in coded and English datasets?**

In [None]:
# Share of each Q2 answer (one slice per distinct value) — coded dataset
fig_coded_q2_pie = px.pie(
    data_frame=df_coded,
    names='Q2',
    title="What is the distribution of responses for Q2 in the coded dataset?",
)
fig_coded_q2_pie.show()

# Share of each Q2 answer — English dataset
fig_en_q2_pie = px.pie(
    data_frame=df_en,
    names='Q2',
    title="What is the distribution of responses for Q2 in the English dataset?",
)
fig_en_q2_pie.show()

**How are the responses distributed for Q3 in coded vs English datasets?**

In [None]:
# Share of each Q3 answer (one slice per distinct value) — coded dataset
fig_coded_q3_pie = px.pie(
    data_frame=df_coded,
    names='Q3',
    title="How are the responses distributed for Q3 in the coded dataset?",
)
fig_coded_q3_pie.show()

# Share of each Q3 answer — English dataset
fig_en_q3_pie = px.pie(
    data_frame=df_en,
    names='Q3',
    title="How are the responses distributed for Q3 in the English dataset?",
)
fig_en_q3_pie.show()

**What is the percentage breakdown of responses for Q4 in both datasets?**

In [None]:
# Share of each Q4 answer (one slice per distinct value) — coded dataset
fig_coded_q4_pie = px.pie(
    data_frame=df_coded,
    names='Q4',
    title="What is the percentage breakdown of responses for Q4 in the coded dataset?",
)
fig_coded_q4_pie.show()

# Share of each Q4 answer — English dataset
fig_en_q4_pie = px.pie(
    data_frame=df_en,
    names='Q4',
    title="What is the percentage breakdown of responses for Q4 in the English dataset?",
)
fig_en_q4_pie.show()

**What are the proportions of responses to Q5.1 and Q5.3 in both datasets?**

In [None]:
# --- Q5.1: share of each answer ---

# Coded dataset
fig_coded_q5_1_pie = px.pie(
    data_frame=df_coded,
    names='Q5.1',
    title="What are the proportions of responses to Q5.1 in the coded dataset?",
)
fig_coded_q5_1_pie.show()

# English dataset
fig_en_q5_1_pie = px.pie(
    data_frame=df_en,
    names='Q5.1',
    title="What are the proportions of responses to Q5.1 in the English dataset?",
)
fig_en_q5_1_pie.show()

# --- Q5.3: share of each answer ---

# Coded dataset
fig_coded_q5_3_pie = px.pie(
    data_frame=df_coded,
    names='Q5.3',
    title="What are the proportions of responses to Q5.3 in the coded dataset?",
)
fig_coded_q5_3_pie.show()

# English dataset
fig_en_q5_3_pie = px.pie(
    data_frame=df_en,
    names='Q5.3',
    title="What are the proportions of responses to Q5.3 in the English dataset?",
)
fig_en_q5_3_pie.show()

**How do the responses to Q6.1 and Q6.3 break down?**

In [None]:
# --- Q6.1: share of each answer ---

# Coded dataset
fig_coded_q6_1_pie = px.pie(
    data_frame=df_coded,
    names='Q6.1',
    title="How do the responses to Q6.1 break down in the coded dataset?",
)
fig_coded_q6_1_pie.show()

# English dataset
fig_en_q6_1_pie = px.pie(
    data_frame=df_en,
    names='Q6.1',
    title="How do the responses to Q6.1 break down in the English dataset?",
)
fig_en_q6_1_pie.show()

# --- Q6.3: share of each answer ---

# Coded dataset
fig_coded_q6_3_pie = px.pie(
    data_frame=df_coded,
    names='Q6.3',
    title="How do the responses to Q6.3 break down in the coded dataset?",
)
fig_coded_q6_3_pie.show()

# English dataset
fig_en_q6_3_pie = px.pie(
    data_frame=df_en,
    names='Q6.3',
    title="How do the responses to Q6.3 break down in the English dataset?",
)
fig_en_q6_3_pie.show()

**What is the proportion of responses for Q7.2 in each dataset?**

In [None]:
# Share of each Q7.2 answer (one slice per distinct value) — coded dataset
fig_coded_q7_2_pie = px.pie(
    data_frame=df_coded,
    names='Q7.2',
    title="What is the proportion of responses for Q7.2 in the coded dataset?",
)
fig_coded_q7_2_pie.show()

# Share of each Q7.2 answer — English dataset
fig_en_q7_2_pie = px.pie(
    data_frame=df_en,
    names='Q7.2',
    title="What is the proportion of responses for Q7.2 in the English dataset?",
)
fig_en_q7_2_pie.show()

**What is the percentage breakdown of responses to Q8.2 in both datasets?**

In [None]:
# Share of each Q8.2 answer (one slice per distinct value) — coded dataset
fig_coded_q8_2_pie = px.pie(
    data_frame=df_coded,
    names='Q8.2',
    title="What is the percentage breakdown of responses to Q8.2 in the coded dataset?",
)
fig_coded_q8_2_pie.show()

# Share of each Q8.2 answer — English dataset
fig_en_q8_2_pie = px.pie(
    data_frame=df_en,
    names='Q8.2',
    title="What is the percentage breakdown of responses to Q8.2 in the English dataset?",
)
fig_en_q8_2_pie.show()

**What is the distribution of responses for Q9.4 across all participants?**

In [None]:
# Share of each Q9.4 answer (one slice per distinct value) — coded dataset
fig_coded_q9_4_pie = px.pie(
    data_frame=df_coded,
    names='Q9.4',
    title="What is the distribution of responses for Q9.4 across all participants in the coded dataset?",
)
fig_coded_q9_4_pie.show()

# Share of each Q9.4 answer — English dataset
fig_en_q9_4_pie = px.pie(
    data_frame=df_en,
    names='Q9.4',
    title="What is the distribution of responses for Q9.4 across all participants in the English dataset?",
)
fig_en_q9_4_pie.show()