# In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Load the dataset from 'data.csv' (path is relative to the working directory).
data = pd.read_csv('data.csv')

# Display the first few rows of the DataFrame as a quick sanity check.
print(data.head())

# Get information about the DataFrame (columns, dtypes, non-null counts).
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
data.info()

# Generate summary statistics for the DataFrame's columns.
print(data.describe())

# Handle missing values: drop every row that contains at least one NaN.
data = data.dropna()

# Remove duplicate rows (the first occurrence of each row is kept).
data = data.drop_duplicates()

# Fill missing values with the per-column mean.
# numeric_only=True restricts the mean to numeric columns; without it,
# pandas >= 2.0 raises TypeError as soon as the frame holds a string column
# (this file compares the 'group' column against 'A' / 'B', so one exists).
# NOTE(review): dropna() above has already removed every row with a missing
# value, so this fill currently has nothing left to replace. If mean
# imputation is the intended strategy, run it *instead of* dropna().
data = data.fillna(data.mean(numeric_only=True))

# Histogram of 'column_name' in 10 bins, drawn on the current axes.
ax = plt.gca()
ax.hist(data['column_name'], bins=10)
# Apply both axis labels and the title in a single call.
ax.set(
    xlabel='X-axis label',
    ylabel='Y-axis label',
    title='Title of the Histogram',
)
plt.show()

# Scatter plot of column 'y' against column 'x'.
plt.scatter(x=data['x'], y=data['y'])

# Label the axes that the scatter call just drew on.
scatter_axes = plt.gca()
scatter_axes.set_title('Title of the Scatter Plot')
scatter_axes.set_xlabel('X-axis label')
scatter_axes.set_ylabel('Y-axis label')
plt.show()

# Mean and median of 'column_name'.
column_values = data['column_name']
mean_value, median_value = column_values.mean(), column_values.median()

print("Mean:", mean_value)
print("Median:", median_value)

# Correlation between 'column1' and 'column2', using Series.corr with its
# default method.
first_series = data['column1']
second_series = data['column2']
correlation = first_series.corr(second_series)
print("Correlation:", correlation)

# Independent two-sample t-test on 'column_name' between groups 'A' and 'B'.
from scipy.stats import ttest_ind

# Select the measurement column first, then mask it down to each group's rows.
group1 = data['column_name'][data['group'] == 'A']
group2 = data['column_name'][data['group'] == 'B']

# The result unpacks into the test statistic and its p-value.
t_statistic, p_value = ttest_ind(group1, group2)
print("T-Statistic:", t_statistic)
print("P-Value:", p_value)

# Fit a linear regression on two feature columns and predict the held-out rows.
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

# Target column and the two-column feature frame.
y = data['target']
X = data[['feature1', 'feature2']]

# Hold out 20% of the rows for testing; random_state=42 fixes the shuffle
# seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# fit() returns the estimator itself, so construction and training chain.
model = LinearRegression().fit(X_train, y_train)
predictions = model.predict(X_test)

print("Predictions:", predictions)
