# Karl Pearson’s correlation

## The variables must have a Gaussian distribution and a linear relationship.

## Covariance

### cov(x,y) = sum((x-mean(x))*(y-mean(y)))/(n-1)

## A problem with covariance as a statistical tool alone is that it is challenging to interpret. This leads us to the Pearson’s correlation coefficient next.

## The Pearson’s correlation coefficient is calculated as the covariance of the two variables divided by the product of the standard deviation of each data sample. It is the normalization of the covariance between the two variables to give an interpretable score.

### Pearson’s correlation coefficient = covariance(x,y)/(stdv(x)*stdv(y))

## The use of mean and standard deviation in the calculation suggests the need for the two data samples to have a Gaussian or Gaussian-like distribution.

# Spearman’s Correlation

## Two variables may be related by a nonlinear relationship, such that the relationship is stronger or weaker across the distribution of the variables

## The two variables being considered may also have a non-Gaussian distribution.

## This test of relationship can also be used if there is a linear relationship between the variables, but will have slightly less power (e.g. may result in lower coefficient scores).

## Instead of calculating the coefficient using covariance and standard deviations on the samples themselves, these statistics are calculated from the relative rank of values on each sample. This is a common approach used in non-parametric statistics, i.e. statistical methods where we do not assume a distribution of the data such as Gaussian.



In [6]:
%%time
def findNumber(arr_1, arr_2):
    """Compute, print and return Pearson's correlation coefficient.

    The coefficient is the sum of the products of the paired deviations from
    the mean, divided by the product of the root sums of squared deviations
    of each sample.

    Args:
        arr_1: Iterable of numbers (first sample).
        arr_2: Iterable of numbers (second sample), paired with ``arr_1``.

    Returns:
        The correlation coefficient as a float.

    Raises:
        ValueError: If the two samples do not have the same length.
        ZeroDivisionError: If a sample is empty or has no variation.
    """
    History_Scores = list(arr_1)
    Physics_Scores = list(arr_2)
    if len(History_Scores) != len(Physics_Scores):
        raise ValueError("Both samples must contain the same number of values")

    avg_history_score = float(sum(History_Scores)) / len(History_Scores)
    avg_physics_score = float(sum(Physics_Scores)) / len(Physics_Scores)

    # The deviations are read twice (cross products, then squares), so they
    # must be real lists: a lazy map() would already be exhausted by the
    # first pass and the second pass would silently sum to zero.
    difference_history_score = [x - avg_history_score for x in History_Scores]
    difference_physics_score = [y - avg_physics_score for y in Physics_Scores]

    coeff = sum(
        dx * dy
        for dx, dy in zip(difference_history_score, difference_physics_score)
    )
    # Root sums of squared deviations; the 1/(n-1) factors of the covariance
    # and standard deviations cancel out, so they are not applied.
    stdv__history_score = sum(d ** 2 for d in difference_history_score) ** 0.5
    stdv__physics_score = sum(d ** 2 for d in difference_physics_score) ** 0.5

    score = coeff / (stdv__history_score * stdv__physics_score)
    print("Karl Pearson’s coefficient {score:.3f}".format(score=score))
    return score

CPU times: user 10 µs, sys: 2 µs, total: 12 µs
Wall time: 16.9 µs


## Pearson correlation coefficient and p-value for testing non-correlation.

    The Pearson correlation coefficient [1]_ measures the linear relationship
    between two datasets.  The calculation of the p-value relies on the
    assumption that each dataset is normally distributed.  (See Kowalski [3]_
    for a discussion of the effects of non-normality of the input on the
    distribution of the correlation coefficient.)  Like other correlation
    coefficients, this one varies between -1 and +1 with 0 implying no
    correlation. Correlations of -1 or +1 imply an exact linear relationship.
    Positive correlations imply that as x increases, so does y. Negative
    correlations imply that as x increases, y decreases.

    The p-value roughly indicates the probability of an uncorrelated system
    producing datasets that have a Pearson correlation at least as extreme
    as the one computed from these datasets.
    
    
    r = \frac{\sum (x - m_x) (y - m_y)}
                 {\sqrt{\sum (x - m_x)^2 \sum (y - m_y)^2}}

In [3]:
%%time
import scipy.stats as s
# Two paired samples of scores, one value per student in each list.
History_Scores = [10, 25, 17, 11, 13, 17, 20, 13, 9, 15]
Physics_Scores = [15, 12, 8, 8, 7, 7, 7, 6, 5, 3]

# Print SciPy's result (correlation coefficient and p-value) as returned.
print(s.pearsonr(History_Scores, Physics_Scores))

(0.14499815458068518, 0.6894014481166955)
CPU times: user 570 µs, sys: 0 ns, total: 570 µs
Wall time: 500 µs


In [4]:
import ast

# Each prompt expects a Python-style list literal, e.g. "[10, 25, 17]".
# On Python 3 input() returns the raw text, so it has to be parsed;
# ast.literal_eval accepts only literals and never executes user code.
History_Scores = list(ast.literal_eval(input("Give History Score list here")))
Physics_Scores = list(ast.literal_eval(input("Give Physics Score list here")))

avg_history_score = float(sum(History_Scores)) / len(History_Scores)
avg_physics_score = float(sum(Physics_Scores)) / len(Physics_Scores)

# Deviations from the mean are kept as lists because each one is traversed
# twice below; a lazy map() would be empty on the second traversal.
difference_history_score = [x - avg_history_score for x in History_Scores]
difference_physics_score = [y - avg_physics_score for y in Physics_Scores]

# Numerator: sum of the products of the paired deviations.
product = [
    dx * dy
    for dx, dy in zip(difference_history_score, difference_physics_score)
]
coeff = sum(product)

# Denominator terms: root sum of squared deviations of each sample.
stdv__history_score = sum(d ** 2 for d in difference_history_score) ** 0.5
stdv__physics_score = sum(d ** 2 for d in difference_physics_score) ** 0.5

score = coeff / (stdv__history_score * stdv__physics_score)
print("Karl Pearson’s coefficient {score:.3f}".format(score=score))


Give History Score list here[10,  25,  17,  11,  13,  17,  20,  13,  9,   15]
Give Physics Score list here[15,  12,  8,   8,   7,   7,   7,   6,   5,   3]
Karl Pearson’s coefficient 0.145


(0.807027216905996, 1.708185943772348e-05)

In [None]:
# Enter your code here. Read input from STDIN. Print output to STDOUT
def findNumber(arr_1, arr_2):
    """Print (rounded to 3 places) and return Pearson's coefficient.

    Args:
        arr_1: Iterable of numbers (first sample).
        arr_2: Iterable of numbers (second sample), paired with ``arr_1``.

    Returns:
        The unrounded correlation coefficient.

    Raises:
        ValueError: If the samples differ in length, hold two or fewer
            values, or either sample has no variation.
    """
    hist = list(arr_1)
    phys = list(arr_2)

    # Guard: the samples must be paired and contain more than two values.
    if len(hist) != len(phys) or len(hist) <= 2:
        raise ValueError("Invalid Data")

    hist_mean = float(sum(hist)) / len(hist)
    phys_mean = float(sum(phys)) / len(phys)
    hist_dev = [value - hist_mean for value in hist]
    phys_dev = [value - phys_mean for value in phys]

    cross_sum = sum(a * b for a, b in zip(hist_dev, phys_dev))
    hist_norm = sum([d ** 2 for d in hist_dev]) ** 0.5
    phys_norm = sum([d ** 2 for d in phys_dev]) ** 0.5

    # Guard: a sample without variation would make the denominator zero.
    if not (hist_norm > 0 and phys_norm > 0):
        raise ValueError("Invalid data")

    score = cross_sum / (hist_norm * phys_norm)
    print(round(score, 3))
    return score
if __name__ == '__main__':
    # Each sample arrives on its own line as whitespace-separated integers.
    # The values stay lazy map objects; findNumber turns them into lists.
    first_line = input().rstrip()
    arr_1 = map(int, first_line.split())
    second_line = input().rstrip()
    arr_2 = map(int, second_line.split())
    findNumber(arr_1, arr_2)


In [11]:
import itertools

lines = []

# Read rows of whitespace-separated integers until the input ends or a
# row that is not purely numeric is entered.
while True:
    try:
        lines.append(list(map(int, input().rstrip().split())))
    except (EOFError, ValueError):
        break

# The first row is the header: k, then the modulus m.
k = lines[0][0]
m = lines[0][1]

if 1 <= m <= 1000 and 1 <= k <= 7:
    sqlist = []
    # Every row after the header starts with a count followed by its
    # elements; blank rows are skipped.
    for row in lines[1:]:
        if not row:
            continue
        count, elements = row[0], row[1:]
        # Out-of-range input ends the program silently.
        if count > 7 or not all(1 <= value <= 10 ** 9 for value in elements):
            raise SystemExit
        sqlist.append([value ** 2 for value in elements])

    # Pick one squared element from every row in all possible ways and keep
    # the largest remainder of the sum modulo m.
    max_remainder = max(
        (sum(combo) % m for combo in itertools.product(*sqlist)),
        default=0,
    )
    print(max_remainder)

3


# Polynomial regression is a special case of linear regression; the main idea lies in how you select your features. Consider a multivariate regression with 2 variables, x1 and x2. Linear regression will look like this: y = a1 * x1 + a2 * x2.

# Now suppose you want a polynomial regression (let's make it a degree-2 polynomial). We create a few additional features: x1*x2, x1^2 and x2^2. So we get the following 'linear regression':

# y = a1 * x1 + a2 * x2 + a3 * x1*x2 + a4 * x1^2 + a5 * x2^2

# This nicely illustrates an important concept, the curse of dimensionality: the number of new features grows much faster than linearly with the degree of the polynomial.

# If you are using a multivariate regression and not just a univariate regression, do not forget the cross terms. For instance, if you have two variables x1 and x2 and you want polynomials up to power 2, you should use y = a1*x1 + a2*x2 + a3*x1^2 + a4*x2^2 + a5*x1*x2, where the last term (a5*x1*x2) is the cross term.

In [None]:

from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
import numpy as np

# Header line: m (feature columns per row) and n (training rows).
# split() without an argument tolerates repeated spaces and tabs.
mn = input().split()

m = int(mn[0])
n = int(mn[1])

# Training rows: the first m values are features, the rest is the target.
X = []
Y = []
for _ in range(n):
    x = [float(a) for a in input().split()]
    X.append(x[0:m])
    Y.append(x[m:])

# Number of rows to predict, followed by the rows themselves.
q = int(input().strip())

X_test = []
for _ in range(q):
    x_test = [float(a) for a in input().split()]
    X_test.append(x_test)

# Expand the features into all polynomial terms up to degree 3, then fit an
# ordinary linear regression on the expanded features.
poly = PolynomialFeatures(degree=3)
regression = LinearRegression()
regression.fit(poly.fit_transform(np.array(X)), Y)

# Y holds one list per row, so each prediction is a row as well; print its
# first entry rounded to 2 decimals.
Y_test = regression.predict(poly.transform(np.array(X_test)))
for y_test in Y_test:
    print(round(y_test[0], 2))

In [None]:
def p_c_c(x, y, n):
    """Return Pearson's correlation coefficient rounded to 2 decimals.

    Uses the computational form
    ``(n*Sxy - Sx*Sy) / sqrt((n*Sxx - Sx**2) * (n*Syy - Sy**2))``
    over the first ``n`` paired observations only.

    Args:
        x: Sequence of numbers (first variable).
        y: Sequence of numbers (second variable), paired with ``x``.
        n: Number of leading observations to use.

    Returns:
        The coefficient rounded to two decimal places.

    Raises:
        IndexError: If ``n`` exceeds the length of ``x`` or ``y``.
        ZeroDivisionError: If either variable has no variation.
    """
    if n > len(x) or n > len(y):
        raise IndexError("n exceeds the number of available observations")

    # Every sum must cover the same n observations; summing the complete
    # sequences would corrupt the result whenever n is smaller than len(x).
    xs, ys = x[:n], y[:n]
    sum_x, sum_y = sum(xs), sum(ys)
    sum_xi_yi = sum(a * b for a, b in zip(xs, ys))

    s_x = n * sum(a * a for a in xs) - sum_x ** 2
    s_y = n * sum(b * b for b in ys) - sum_y ** 2
    denom = (s_x * s_y) ** 0.5
    nume = n * sum_xi_yi - sum_x * sum_y
    return round(nume / denom, 2)


# First line: how many rows follow. Each row carries at least three
# numbers; the first three are stored in math, phy and che, in that order.
number_of_rows = int(input().strip())
math, phy, che = [], [], []
columns = (math, phy, che)
for _ in range(number_of_rows):
    row_i = [float(token) for token in input().strip().split()]
    for position, column in enumerate(columns):
        column.append(row_i[position])

# Print the pairwise coefficients: math-phy, phy-che, then che-math.
for first, second in ((math, phy), (phy, che), (che, math)):
    print(p_c_c(first, second, number_of_rows))




In [None]:
from sklearn.linear_model import LinearRegression

# Header line: number of feature columns, then number of training rows.
col_row_count = [int(token) for token in input().strip().split()]
col_count, row_count = col_row_count[0], col_row_count[1]

# Training rows: the first col_count values are the features and the last
# value on the row is the target.
train = []
target = []
for _ in range(row_count):
    row_i = [float(token) for token in input().strip().split()]
    train.append(row_i[:col_count])
    target.append(row_i[-1])

# Rows to predict: only the first col_count values of each row are used.
test_count = int(input().strip())
test = []
for _ in range(test_count):
    row_i = [float(token) for token in input().strip().split()]
    test.append(row_i[:col_count])

# Fit on the training data and print one prediction per line.
lm = LinearRegression()
lm.fit(train, target)
print("\n".join(str(prediction) for prediction in lm.predict(test)))
