In [None]:
from itertools import combinations

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split

# Assignment5


## Ground Cricket Chirps

In _The Song of Insects_ (1948) by George W. Pierce, Pierce mechanically measured the frequency (the number of wing vibrations per second) of chirps (or pulses of sound) made by a striped ground cricket, at various ground temperatures.  Since crickets are ectotherms (cold-blooded), the rate of their physiological processes and their overall metabolism are influenced by temperature.  Consequently, there is reason to believe that temperature would have a profound effect on aspects of their behavior, such as chirp frequency.

In general, it was found that crickets did not sing at temperatures colder than 60° F. or warmer than 100° F.

In [None]:
# Striped ground cricket measurements: position i of each list belongs to the
# same observation (a ground temperature and the chirp rate recorded with it).
ground_temperatures = [
    88.6, 71.6, 93.3, 84.3, 80.6, 75.2, 69.7, 71.6,
    69.4, 83.3, 79.6, 82.6, 80.6, 83.5, 76.3,
]
chirps_per_second = [
    20.0, 16.0, 19.8, 18.4, 17.1, 15.5, 14.7, 15.7,
    15.4, 16.3, 15.0, 17.2, 16.0, 17.0, 14.4,
]
ground_cricket_data = {
    "Ground Temperature": ground_temperatures,
    "Chirps/Second": chirps_per_second,
}
df = pd.DataFrame(ground_cricket_data)

### Tasks

1. Find the linear regression equation for this data.
2. Chart the original data and the equation on the chart.
3. Find the equation's $R^2$ score (use the `.score` method) to determine whether the
equation is a good fit for this data. (0.8 and greater is considered a strong correlation.)
4. Extrapolate data:  If the ground temperature reached 95, then at what approximate rate would you expect the crickets to be chirping?
5. Interpolate data:  With a listening device, you discovered that on a particular morning the crickets were chirping at a rate of 18 chirps per second.  What was the approximate ground temperature that morning? 

In [None]:
# Preview the first rows of the cricket frame to confirm both columns loaded.
df.head()

Unnamed: 0,Ground Temperature,Chirps/Second
0,88.6,20.0
1,71.6,16.0
2,93.3,19.8
3,84.3,18.4
4,80.6,17.1


In [None]:
# Predictor is the ground temperature; response is the chirp rate.
X = df["Ground Temperature"]
y = df["Chirps/Second"]

In [None]:
# Turn both series into single-column 2-D arrays (n_samples, 1) before fitting.
X = np.reshape(np.array(X), (-1, 1))
y = np.reshape(np.array(y), (-1, 1))

In [None]:
# Task 1: fit a least-squares line predicting chirps/second (y) from ground temperature (X).
model1 = LinearRegression()
model1.fit(X,y)

LinearRegression()

In [None]:
# Slope of the fitted line: change in chirps/second per unit of ground temperature.
model1.coef_

array([[0.20299973]])

In [None]:
# Intercept of the fitted line (predicted chirps/second at a ground temperature of 0).
model1.intercept_

array([0.45931465])

In [None]:
# Print the fitted line as "y = slope x + intercept".
# coef_ and intercept_ are size-1 arrays here because y was reshaped to a column;
# `.item()` pulls out the scalar. Calling float() on an array with ndim > 0 is
# deprecated since NumPy 1.25 and is slated to become an error.
print("y = ", model1.coef_.item(), "x +", model1.intercept_.item())

y =  0.20299973140771363 x + 0.45931464523595267


In [None]:
# Fitted chirp rates for the same temperatures the model was trained on.
pred = model1.predict(X)

In [None]:
# Show the fitted values (one row per observation).
pred

array([[18.44509085],
       [14.99409541],
       [19.39918959],
       [17.572192  ],
       [16.821093  ],
       [15.72489445],
       [14.60839592],
       [14.99409541],
       [14.547496  ],
       [17.36919227],
       [16.61809327],
       [17.22709246],
       [16.821093  ],
       [17.40979222],
       [15.94819415]])

In [None]:
# Task 3: R^2 of the line on the data it was fitted to.
# The task treats 0.8 and above as a strong correlation.
model1.score(X,y)

0.6922946529146998

In [None]:
# Task 4: predicted chirps/second at a ground temperature of 95.
# The input is a 1x1 nested list because predict expects a 2-D (n_samples, n_features) input.
model1.predict([[95]])

array([[19.74428913]])

In [None]:
# Task 5 setup: a second regression with the roles swapped, predicting ground
# temperature (X) from chirps/second (y).
# NOTE(review): this is a separate least-squares fit, not the algebraic inverse of
# model1's line; the two give different temperatures unless the fit is perfect.
# Confirm which one the assignment intends.
new_model = LinearRegression()
new_model.fit(y,X)

LinearRegression()

In [None]:
# Fitted ground temperatures for the observed chirp rates, from the swapped regression.
pred1 = new_model.predict(y)
pred1

array([[91.0554423],
       [77.4141503],
       [90.3733777],
       [85.5989255],
       [81.1655056],
       [75.7089888],
       [72.9807304],
       [76.3910534],
       [75.3679565],
       [78.4372472],
       [74.0038273],
       [81.5065379],
       [77.4141503],
       [80.8244733],
       [71.9576335]])

In [None]:
# Task 5: estimated ground temperature when the crickets chirp 18 times per second,
# using the temperature-on-chirps regression (new_model).
new_model.predict([[18]])

array([[84.2347963]])

# Assignment6

## Brain vs. Body Weight

In the file `brain_body.txt`, the average brain and body weight for a number of mammal species are recorded. Load this data into a Pandas data frame.

### Tasks

1. Find the linear regression equation for this data for brain weight to body weight.
2. Chart the original data and the equation on the chart.
3. Find the equation's $R^2$ score (use the `.score` method) to determine whether the
equation is a good fit for this data. (0.8 and greater is considered a strong correlation.)

In [None]:
# Load the fixed-width brain/body weight table.
# NOTE: this rebinds `df`, so the cricket frame is no longer reachable under that name from here on.
df = pd.read_fwf("brain_body.txt")

In [None]:
# Preview the first rows to check that the Brain and Body columns parsed correctly.
df.head()

Unnamed: 0,Brain,Body
0,3.385,44.5
1,0.48,15.5
2,1.35,8.1
3,465.0,423.0
4,36.33,119.5


In [None]:
# Predictor is brain weight; response is body weight.
X = df["Brain"]
y = df["Body"]

In [None]:
# Turn both series into single-column 2-D arrays (n_samples, 1) before fitting.
X = np.reshape(np.array(X), (-1, 1))
y = np.reshape(np.array(y), (-1, 1))

In [None]:
# Task 1: fit a least-squares line predicting body weight (y) from brain weight (X).
model = LinearRegression()
model.fit(X,y)

LinearRegression()

In [None]:
# Slope and intercept of the brain -> body line (each still wrapped in a size-1 array).
print(model.coef_)
print(model.intercept_)

[[0.96649637]]
[91.00439621]


In [None]:
# Print the fitted line as "y = slope x + intercept".
# coef_ and intercept_ are size-1 arrays here because y was reshaped to a column;
# `.item()` pulls out the scalar. Calling float() on an array with ndim > 0 is
# deprecated since NumPy 1.25 and is slated to become an error.
print("y = ", model.coef_.item(), "x +", model.intercept_.item())

y =  0.9664963676725759 x + 91.00439620740687


In [None]:
# Fitted body weights for every species in the table.
# NOTE(review): the bare `preds` prints the whole array; `preds[:5]` would keep the output short.
preds = model.predict(X)
preds

array([[  94.27598641],
       [  91.46831446],
       [  92.3091663 ],
       [ 540.42520718],
       [ 126.11720924],
       [ 117.73768574],
       [ 105.33753734],
       [  92.00955243],
       [  95.05401599],
       [  91.41515716],
       [  91.10201234],
       [  91.89357287],
       [  91.97089258],
       [  91.00922869],
       [  91.06238599],
       [  94.38713349],
       [  92.93738894],
       [  92.64744003],
       [2552.67064467],
       [  91.02662562],
       [ 271.8358666 ],
       [ 594.54900376],
       [  91.76309586],
       [ 100.66935988],
       [  94.19383422],
       [  91.19769548],
       [  92.36715609],
       [ 602.28097471],
       [ 291.06914432],
       [ 173.15658746],
       [  91.72926848],
       [ 150.927171  ],
       [6522.0712267 ],
       [  94.38713349],
       [  97.57657151],
       [ 124.83176908],
       [  94.9187065 ],
       [  91.12037577],
       [  91.02662562],
       [  91.01406117],
       [  92.35749112],
       [ 332.628

In [None]:
# Task 3: R^2 of the brain -> body fit.
# `score` takes (features, true targets) and runs the prediction itself, so it must
# receive X and y. Passing (y, preds) fed body weights in as features and treated
# the model's own predictions as the truth, which does not measure the fit.
model.score(X, y)

0.8536290611370148

# Assignment7

## Salary Discrimination

The file `salary.txt` contains data for 52 tenure-track professors at a small Midwestern college. This data was used in legal proceedings in the 1980s about discrimination against women in salary.

The data in the file, by column:

1. Sex. 1 for female, 0 for male.
2. Rank. 1 for assistant professor, 2 for associate professor, 3 for full professor.
3. Year. Number of years in current rank.
4. Degree. Highest degree. 1 for doctorate, 0 for master's.
5. YSdeg. Years since highest degree was earned.
6. Salary. Salary/year in dollars.

### Tasks

1. Find the linear regression equation for this data using columns 1-5 to column 6.
2. Find the selection of columns with the best $R^2$ score.
3. Report whether sex is a factor in salary.

In [None]:
from sklearn.decomposition import PCA

In [None]:
# The file is read with header=None, so the six column names are supplied here
# in the order listed in the data description.
SALARY_COLUMNS = ["Sex", "Rank", "Year", "Degree", "YSdeg", "Salary"]
df = pd.read_fwf("salary.txt", header=None, names=SALARY_COLUMNS)

In [None]:
# Preview the first rows to check that the six named columns line up with the data.
df.head()

Unnamed: 0,Sex,Rank,Year,Degree,YSdeg,Salary
0,0,3,25,1,35,36350
1,0,3,13,1,22,35350
2,0,3,10,1,23,28200
3,1,3,7,1,27,26775
4,0,3,19,0,30,33696


In [None]:
# (rows, columns); the description says 52 professors and 6 fields.
df.shape

(52, 6)

In [None]:
# Count missing values per column before modelling.
df.isna().sum()

Sex       0
Rank      0
Year      0
Degree    0
YSdeg     0
Salary    0
dtype: int64

In [None]:
# Predictors are the five columns Sex through YSdeg; the response is Salary.
X = df.loc[:, "Sex":"YSdeg"]
y = df["Salary"]

In [None]:
# Convert to plain arrays: a five-column feature matrix and a single-column target.
X = np.reshape(np.array(X), (-1, 5))
y = np.reshape(np.array(y), (-1, 1))

In [None]:
# Task 1: fit salary (y) on all five predictor columns (X).
model2 = LinearRegression()
model2.fit(X,y)

LinearRegression()

In [None]:
# One coefficient per predictor, in the column order of X (Sex, Rank, Year, Degree, YSdeg),
# followed by the intercept.
print(model2.coef_)
print(model2.intercept_)

[[ 1241.7924996   5586.18144952   482.85976783 -1331.64406341
   -128.79057354]]
[11410.14654726]


In [None]:
# Print the fitted equation as "y = c1 x1 + c2 x2 + ... + intercept".
# The terms are built in a loop so the line follows however many coefficients the
# model has; the printed text matches the hand-written five-term version.
equation_parts = []
for position, coefficient in enumerate(model2.coef_.ravel(), start=1):
    equation_parts += [float(coefficient), f"x{position} +"]
# intercept_ is a size-1 array; `.item()` avoids float() on an ndim > 0 array,
# which is deprecated since NumPy 1.25.
print("y = ", *equation_parts, model2.intercept_.item())

y =  1241.792499601427 x1 + 5586.1814495214385 x2 + 482.85976782882153 x3 + -1331.6440634059095 x4 + -128.79057354486122 x5 + 11410.14654725559


In [None]:
# Fitted salaries from the five-predictor model, on the training rows.
preds2 = model2.predict(X)

In [None]:
# In-sample R^2 of the five-predictor model (true salaries first, predictions second).
r2_score(y, preds2)

0.8547180674410969

In [None]:
# Task 2: score every non-empty selection of the five predictor columns.
# The earlier version regressed on PCA components. A component is a blend of all
# columns, not a selection of them, so it never evaluated a column subset and its
# "With i columns" label was misleading.
feature_names = list(df.columns[:5])
features = np.asarray(X)

subset_scores = {}
for size in range(1, len(feature_names) + 1):
    for picked in combinations(range(len(feature_names)), size):
        x = features[:, list(picked)]
        model3 = LinearRegression().fit(x, y)
        chosen = tuple(feature_names[i] for i in picked)
        subset_scores[chosen] = model3.score(x, y)

# Report the best subset for each size. In-sample R^2 cannot decrease when a
# column is added, so the full set always scores highest overall; comparing
# within a size shows which columns carry the signal.
for size in range(1, len(feature_names) + 1):
    candidates = [names for names in subset_scores if len(names) == size]
    best = max(candidates, key=subset_scores.get)
    print("Best", size, "column(s):", ", ".join(best), "->", subset_scores[best])

With 1 columns: 0.5186076966091977
With 2 columns: 0.5806435473452343
With 3 columns: 0.8187449442374395
With 4 columns: 0.8189289974902678
With 5 columns: 0.8547180674410969


In [None]:
# Same regression inputs as before, but with the Sex column left out.
X1 = df.loc[:, "Rank":"YSdeg"]
y1 = df["Salary"]

In [None]:
# Convert the Sex-free predictors and the salary target to 2-D arrays.
# Fix: build these from X1 / y1, not from the five-column X. Reshaping X's 52x5
# values into 4 columns yields 65 rows, which neither drops the Sex column nor
# lines up with the 52 salaries, so the following fit fails on a clean run.
X1 = np.array(X1).reshape((-1,4))
y1 = np.array(y1).reshape((-1,1))

In [None]:
# Task 3: refit on X1 (meant to hold the predictors without Sex) and report the in-sample R^2.
model4 = LinearRegression()
model4.fit(X1, y1)
preds4 = model4.predict(X1)

score_without_sex = r2_score(y1, preds4)
print("Score without Sex column:", score_without_sex)
print("Sex column necessarily need not be a factor of Salary.")

Score without Sex column: 0.8485077204335426
Sex column necessarily need not be a factor of Salary.
